[llvm] 47bac63 - [AMDGPU] gfx940 memory model
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 14 15:02:05 PDT 2022
Author: Stanislav Mekhanoshin
Date: 2022-03-14T15:01:46-07:00
New Revision: 47bac63d3f6b9e64fdf997aff1f145bc948f02d9
URL: https://github.com/llvm/llvm-project/commit/47bac63d3f6b9e64fdf997aff1f145bc948f02d9
DIFF: https://github.com/llvm/llvm-project/commit/47bac63d3f6b9e64fdf997aff1f145bc948f02d9.diff
LOG: [AMDGPU] gfx940 memory model
Differential Revision: https://reviews.llvm.org/D121242
Added:
Modified:
llvm/docs/AMDGPUUsage.rst
llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
Removed:
################################################################################
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index b3ad9c98b3098..539cda827a8d6 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -5018,6 +5018,7 @@ following sections:
* :ref:`amdgpu-amdhsa-memory-model-gfx6-gfx9`
* :ref:`amdgpu-amdhsa-memory-model-gfx90a`
+* :ref:`amdgpu-amdhsa-memory-model-gfx940`
* :ref:`amdgpu-amdhsa-memory-model-gfx10`
.. _amdgpu-amdhsa-memory-model-gfx6-gfx9:
@@ -8669,6 +8670,2422 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx90a-table`.
- system for OpenCL.*
============ ============ ============== ========== ================================
+.. _amdgpu-amdhsa-memory-model-gfx940:
+
+Memory Model GFX940
++++++++++++++++++++
+
+For GFX940:
+
+* Each agent has multiple shader arrays (SA).
+* Each SA has multiple compute units (CU).
+* Each CU has multiple SIMDs that execute wavefronts.
+* The wavefronts for a single work-group are executed in the same CU but may be
+ executed by
diff erent SIMDs. The exception is when in tgsplit execution mode
+ when the wavefronts may be executed by
diff erent SIMDs in
diff erent CUs.
+* Each CU has a single LDS memory shared by the wavefronts of the work-groups
+ executing on it. The exception is when in tgsplit execution mode when no LDS
+ is allocated as wavefronts of the same work-group can be in
diff erent CUs.
+* All LDS operations of a CU are performed as wavefront wide operations in a
+ global order and involve no caching. Completion is reported to a wavefront in
+ execution order.
+* The LDS memory has multiple request queues shared by the SIMDs of a
+ CU. Therefore, the LDS operations performed by
diff erent wavefronts of a
+ work-group can be reordered relative to each other, which can result in
+ reordering the visibility of vector memory operations with respect to LDS
+ operations of other wavefronts in the same work-group. A ``s_waitcnt
+ lgkmcnt(0)`` is required to ensure synchronization between LDS operations and
+ vector memory operations between wavefronts of a work-group, but not between
+ operations performed by the same wavefront.
+* The vector memory operations are performed as wavefront wide operations and
+ completion is reported to a wavefront in execution order. The exception is
+ that ``flat_load/store/atomic`` instructions can report out of vector memory
+ order if they access LDS memory, and out of LDS operation order if they access
+ global memory.
+* The vector memory operations access a single vector L1 cache shared by all
+ SIMDs a CU. Therefore:
+
+ * No special action is required for coherence between the lanes of a single
+ wavefront.
+
+ * No special action is required for coherence between wavefronts in the same
+ work-group since they execute on the same CU. The exception is when in
+ tgsplit execution mode as wavefronts of the same work-group can be in
+
diff erent CUs and so a ``buffer_inv sc0`` is required which will invalidate
+ the L1 cache is in tgsplit mode.
+
+ * A ``buffer_inv sc1`` is required to invalidate the L1 cache for coherence
+ between wavefronts executing in
diff erent work-groups as they may be
+ executing on
diff erent CUs.
+
+* The scalar memory operations access a scalar L1 cache shared by all wavefronts
+ on a group of CUs. The scalar and vector L1 caches are not coherent. However,
+ scalar operations are used in a restricted way so do not impact the memory
+ model. See :ref:`amdgpu-amdhsa-memory-spaces`.
+* The vector and scalar memory operations use an L2 cache.
+
+ * The gfx940 can be configured as a number of smaller agents with each having
+ a single L2 shared by all CUs on the same agent, or as fewer (possibly one)
+ larger agents with groups of CUs on each agent each sharing separate L2
+ caches.
+ * The L2 cache has independent channels to service disjoint ranges of virtual
+ addresses.
+ * Each CU has a separate request queue per channel for its associated L2.
+ Therefore, the vector and scalar memory operations performed by wavefronts
+ executing with
diff erent L1 caches and the same L2 cache can be reordered
+ relative to each other.
+ * A ``s_waitcnt vmcnt(0)`` is required to ensure synchronization between
+ vector memory operations of
diff erent CUs. It ensures a previous vector
+ memory operation has completed before executing a subsequent vector memory
+ or LDS operation and so can be used to meet the requirements of acquire and
+ release.
+ * An L2 cache can be kept coherent with other L2 caches by using the MTYPE RW
+ (read-write) for memory local to the L2, and MTYPE NC (non-coherent) with
+ the PTE C-bit set for memory not local to the L2.
+
+ * Any local memory cache lines will be automatically invalidated by writes
+ from CUs associated with other L2 caches, or writes from the CPU, due to
+ the cache probe caused by the PTE C-bit.
+ * XGMI accesses from the CPU to local memory may be cached on the CPU.
+ Subsequent access from the GPU will automatically invalidate or writeback
+ the CPU cache due to the L2 probe filter.
+ * To ensure coherence of local memory writes of CUs with
diff erent L1 caches
+ in the same agent a ``buffer_wbl2`` is required. It does nothing if the
+ agent is configured to have a single L2, or will writeback dirty L2 cache
+ lines if configured to have multiple L2 caches.
+ * To ensure coherence of local memory writes of CUs in
diff erent agents a
+ ``buffer_wbl2 sc1`` is required. It will writeback dirty L2 cache lines.
+ * To ensure coherence of local memory reads of CUs with
diff erent L1 caches
+ in the same agent a ``buffer_inv sc1`` is required. It does nothing if the
+ agent is configured to have a single L2, or will invalidate non-local L2
+ cache lines if configured to have multiple L2 caches.
+ * To ensure coherence of local memory reads of CUs in
diff erent agents a
+ ``buffer_inv sc0 sc1`` is required. It will invalidate non-local L2 cache
+ lines if configured to have multiple L2 caches.
+
+ * PCIe access from the GPU to the CPU can be kept coherent by using the MTYPE
+ UC (uncached) which bypasses the L2.
+
+Scalar memory operations are only used to access memory that is proven to not
+change during the execution of the kernel dispatch. This includes constant
+address space and global address space for program scope ``const`` variables.
+Therefore, the kernel machine code does not have to maintain the scalar cache to
+ensure it is coherent with the vector caches. The scalar and vector caches are
+invalidated between kernel dispatches by CP since constant address space data
+may change between kernel dispatch executions. See
+:ref:`amdgpu-amdhsa-memory-spaces`.
+
+The one exception is if scalar writes are used to spill SGPR registers. In this
+case the AMDGPU backend ensures the memory location used to spill is never
+accessed by vector memory operations at the same time. If scalar writes are used
+then a ``s_dcache_wb`` is inserted before the ``s_endpgm`` and before a function
+return since the locations may be used for vector memory instructions by a
+future wavefront that uses the same scratch area, or a function call that
+creates a frame at the same address, respectively. There is no need for a
+``s_dcache_inv`` as all scalar writes are write-before-read in the same thread.
+
+For kernarg backing memory:
+
+* CP invalidates the L1 cache at the start of each kernel dispatch.
+* On dGPU over XGMI or PCIe the kernarg backing memory is allocated in host
+ memory accessed as MTYPE UC (uncached) to avoid needing to invalidate the L2
+ cache. This also causes it to be treated as non-volatile and so is not
+ invalidated by ``*_vol``.
+* On APU the kernarg backing memory is accessed as MTYPE CC (cache coherent) and
+ so the L2 cache will be coherent with the CPU and other agents.
+
+Scratch backing memory (which is used for the private address space) is accessed
+with MTYPE NC_NV (non-coherent non-volatile). Since the private address space is
+only accessed by a single thread, and is always write-before-read, there is
+never a need to invalidate these entries from the L1 cache. Hence all cache
+invalidates are done as ``*_vol`` to only invalidate the volatile cache lines.
+
+The code sequences used to implement the memory model for GFX940 are defined
+in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-table`.
+
+ .. table:: AMDHSA Memory Model Code Sequences GFX940
+ :name: amdgpu-amdhsa-memory-model-code-sequences-gfx940-table
+
+ ============ ============ ============== ========== ================================
+ LLVM Instr LLVM Memory LLVM Memory AMDGPU AMDGPU Machine Code
+ Ordering Sync Scope Address GFX940
+ Space
+ ============ ============ ============== ========== ================================
+ **Non-Atomic**
+ ------------------------------------------------------------------------------------
+ load *none* *none* - global - !volatile & !nontemporal
+ - generic
+ - private 1. buffer/global/flat_load
+ - constant
+ - !volatile & nontemporal
+
+ 1. buffer/global/flat_load
+ nt=1
+
+ - volatile
+
+ 1. buffer/global/flat_load
+ sc0=1 sc1=1
+ 2. s_waitcnt vmcnt(0)
+
+ - Must happen before
+ any following volatile
+ global/generic
+ load/store.
+ - Ensures that
+ volatile
+ operations to
+
diff erent
+ addresses will not
+ be reordered by
+ hardware.
+
+ load *none* *none* - local 1. ds_load
+ store *none* *none* - global - !volatile & !nontemporal
+ - generic
+ - private 1. buffer/global/flat_store
+ - constant
+ - !volatile & nontemporal
+
+ 1. buffer/global/flat_store
+ nt=1
+
+ - volatile
+
+ 1. buffer/global/flat_store
+ sc0=1 sc1=1
+ 2. s_waitcnt vmcnt(0)
+
+ - Must happen before
+ any following volatile
+ global/generic
+ load/store.
+ - Ensures that
+ volatile
+ operations to
+
diff erent
+ addresses will not
+ be reordered by
+ hardware.
+
+ store *none* *none* - local 1. ds_store
+ **Unordered Atomic**
+ ------------------------------------------------------------------------------------
+ load atomic unordered *any* *any* *Same as non-atomic*.
+ store atomic unordered *any* *any* *Same as non-atomic*.
+ atomicrmw unordered *any* *any* *Same as monotonic atomic*.
+ **Monotonic Atomic**
+ ------------------------------------------------------------------------------------
+ load atomic monotonic - singlethread - global 1. buffer/global/flat_load
+ - wavefront - generic
+ load atomic monotonic - workgroup - global 1. buffer/global/flat_load
+ - generic sc0=1
+ load atomic monotonic - singlethread - local *If TgSplit execution mode,
+ - wavefront local address space cannot
+ - workgroup be used.*
+
+ 1. ds_load
+ load atomic monotonic - agent - global 1. buffer/global/flat_load
+ - generic sc1=1
+ load atomic monotonic - system - global 1. buffer/global/flat_load
+ - generic sc0=1 sc1=1
+ store atomic monotonic - singlethread - global 1. buffer/global/flat_store
+ - wavefront - generic
+ store atomic monotonic - singlethread - global 1. buffer/global/flat_store
+ - wavefront - generic
+ store atomic monotonic - workgroup - global 1. buffer/global/flat_store
+ - generic sc0=1
+ store atomic monotonic - agent - global 1. buffer/global/flat_store
+ - generic sc1=1
+ store atomic monotonic - system - global 1. buffer/global/flat_store
+ - generic sc0=1 sc1=1
+ store atomic monotonic - singlethread - local *If TgSplit execution mode,
+ - wavefront local address space cannot
+ - workgroup be used.*
+
+ 1. ds_store
+ atomicrmw monotonic - singlethread - global 1. buffer/global/flat_atomic
+ - wavefront - generic
+ - workgroup
+ - agent
+ atomicrmw monotonic - system - global 1. buffer/global/flat_atomic
+ - generic sc1=1
+ atomicrmw monotonic - singlethread - local *If TgSplit execution mode,
+ - wavefront local address space cannot
+ - workgroup be used.*
+
+ 1. ds_atomic
+ **Acquire Atomic**
+ ------------------------------------------------------------------------------------
+ load atomic acquire - singlethread - global 1. buffer/global/ds/flat_load
+ - wavefront - local
+ - generic
+ load atomic acquire - workgroup - global 1. buffer/global_load sc0=1
+ 2. s_waitcnt vmcnt(0)
+
+ - If not TgSplit execution
+ mode, omit.
+ - Must happen before the
+ following buffer_inv.
+
+ 3. buffer_inv sc0=1
+
+ - If not TgSplit execution
+ mode, omit.
+ - Must happen before
+ any following
+ global/generic
+ load/load
+ atomic/store/store
+ atomic/atomicrmw.
+ - Ensures that
+ following
+ loads will not see
+ stale data.
+
+ load atomic acquire - workgroup - local *If TgSplit execution mode,
+ local address space cannot
+ be used.*
+
+ 1. ds_load
+ 2. s_waitcnt lgkmcnt(0)
+
+ - If OpenCL, omit.
+ - Must happen before
+ any following
+ global/generic
+ load/load
+ atomic/store/store
+ atomic/atomicrmw.
+ - Ensures any
+ following global
+ data read is no
+ older than the local load
+ atomic value being
+ acquired.
+
+ load atomic acquire - workgroup - generic 1. flat_load sc0=1
+ 2. s_waitcnt lgkm/vmcnt(0)
+
+ - Use lgkmcnt(0) if not
+ TgSplit execution mode
+ and vmcnt(0) if TgSplit
+ execution mode.
+ - If OpenCL, omit lgkmcnt(0).
+ - Must happen before
+ the following
+ buffer_inv and any
+ following global/generic
+ load/load
+ atomic/store/store
+ atomic/atomicrmw.
+ - Ensures any
+ following global
+ data read is no
+ older than a local load
+ atomic value being
+ acquired.
+
+ 3. buffer_inv sc0=1
+
+ - If not TgSplit execution
+ mode, omit.
+ - Ensures that
+ following
+ loads will not see
+ stale data.
+
+ load atomic acquire - agent - global 1. buffer/global_load
+ sc1=1
+ 2. s_waitcnt vmcnt(0)
+
+ - Must happen before
+ following
+ buffer_inv.
+ - Ensures the load
+ has completed
+ before invalidating
+ the cache.
+
+ 3. buffer_inv sc1=1
+
+ - Must happen before
+ any following
+ global/generic
+ load/load
+ atomic/atomicrmw.
+ - Ensures that
+ following
+ loads will not see
+ stale global data.
+
+ load atomic acquire - system - global 1. buffer/global/flat_load
+ sc0=1 sc1=1
+ 2. s_waitcnt vmcnt(0)
+
+ - Must happen before
+ following
+ buffer_inv.
+ - Ensures the load
+ has completed
+ before invalidating
+ the cache.
+
+ 3. buffer_inv sc0=1 sc1=1
+
+ - Must happen before
+ any following
+ global/generic
+ load/load
+ atomic/atomicrmw.
+ - Ensures that
+ following
+ loads will not see
+ stale MTYPE NC global data.
+ MTYPE RW and CC memory will
+ never be stale due to the
+ memory probes.
+
+ load atomic acquire - agent - generic 1. flat_load sc1=1
+ 2. s_waitcnt vmcnt(0) &
+ lgkmcnt(0)
+
+ - If TgSplit execution mode,
+ omit lgkmcnt(0).
+ - If OpenCL omit
+ lgkmcnt(0).
+ - Must happen before
+ following
+ buffer_inv.
+ - Ensures the flat_load
+ has completed
+ before invalidating
+ the cache.
+
+ 3. buffer_inv sc1=1
+
+ - Must happen before
+ any following
+ global/generic
+ load/load
+ atomic/atomicrmw.
+ - Ensures that
+ following loads
+ will not see stale
+ global data.
+
+ load atomic acquire - system - generic 1. flat_load sc0=1 sc1=1
+ 2. s_waitcnt vmcnt(0) &
+ lgkmcnt(0)
+
+ - If TgSplit execution mode,
+ omit lgkmcnt(0).
+ - If OpenCL omit
+ lgkmcnt(0).
+ - Must happen before
+ the following
+ buffer_inv.
+ - Ensures the flat_load
+ has completed
+ before invalidating
+ the caches.
+
+ 3. buffer_inv sc0=1 sc1=1
+
+ - Must happen before
+ any following
+ global/generic
+ load/load
+ atomic/atomicrmw.
+ - Ensures that
+ following
+ loads will not see
+ stale MTYPE NC global data.
+ MTYPE RW and CC memory will
+ never be stale due to the
+ memory probes.
+
+ atomicrmw acquire - singlethread - global 1. buffer/global/flat_atomic
+ - wavefront - generic
+ atomicrmw acquire - singlethread - local *If TgSplit execution mode,
+ - wavefront local address space cannot
+ be used.*
+
+ 1. ds_atomic
+ atomicrmw acquire - workgroup - global 1. buffer/global_atomic
+ 2. s_waitcnt vmcnt(0)
+
+ - If not TgSplit execution
+ mode, omit.
+ - Must happen before the
+ following buffer_inv.
+ - Ensures the atomicrmw
+ has completed
+ before invalidating
+ the cache.
+
+ 3. buffer_inv sc0=1
+
+ - If not TgSplit execution
+ mode, omit.
+ - Must happen before
+ any following
+ global/generic
+ load/load
+ atomic/atomicrmw.
+ - Ensures that
+ following loads
+ will not see stale
+ global data.
+
+ atomicrmw acquire - workgroup - local *If TgSplit execution mode,
+ local address space cannot
+ be used.*
+
+ 1. ds_atomic
+ 2. s_waitcnt lgkmcnt(0)
+
+ - If OpenCL, omit.
+ - Must happen before
+ any following
+ global/generic
+ load/load
+ atomic/store/store
+ atomic/atomicrmw.
+ - Ensures any
+ following global
+ data read is no
+ older than the local
+ atomicrmw value
+ being acquired.
+
+ atomicrmw acquire - workgroup - generic 1. flat_atomic
+ 2. s_waitcnt lgkm/vmcnt(0)
+
+ - Use lgkmcnt(0) if not
+ TgSplit execution mode
+ and vmcnt(0) if TgSplit
+ execution mode.
+ - If OpenCL, omit lgkmcnt(0).
+ - Must happen before
+ the following
+ buffer_inv and
+ any following
+ global/generic
+ load/load
+ atomic/store/store
+ atomic/atomicrmw.
+ - Ensures any
+ following global
+ data read is no
+ older than a local
+ atomicrmw value
+ being acquired.
+
+ 3. buffer_inv sc0=1
+
+ - If not TgSplit execution
+ mode, omit.
+ - Ensures that
+ following
+ loads will not see
+ stale data.
+
+ atomicrmw acquire - agent - global 1. buffer/global_atomic
+ 2. s_waitcnt vmcnt(0)
+
+ - Must happen before
+ following
+ buffer_inv.
+ - Ensures the
+ atomicrmw has
+ completed before
+ invalidating the
+ cache.
+
+ 3. buffer_inv sc1=1
+
+ - Must happen before
+ any following
+ global/generic
+ load/load
+ atomic/atomicrmw.
+ - Ensures that
+ following loads
+ will not see stale
+ global data.
+
+ atomicrmw acquire - system - global 1. buffer/global_atomic
+ sc1=1
+ 2. s_waitcnt vmcnt(0)
+
+ - Must happen before
+ following
+ buffer_inv.
+ - Ensures the
+ atomicrmw has
+ completed before
+ invalidating the
+ caches.
+
+ 3. buffer_inv sc0=1 sc1=1
+
+ - Must happen before
+ any following
+ global/generic
+ load/load
+ atomic/atomicrmw.
+ - Ensures that
+ following
+ loads will not see
+ stale MTYPE NC global data.
+ MTYPE RW and CC memory will
+ never be stale due to the
+ memory probes.
+
+ atomicrmw acquire - agent - generic 1. flat_atomic
+ 2. s_waitcnt vmcnt(0) &
+ lgkmcnt(0)
+
+ - If TgSplit execution mode,
+ omit lgkmcnt(0).
+ - If OpenCL, omit
+ lgkmcnt(0).
+ - Must happen before
+ following
+ buffer_inv.
+ - Ensures the
+ atomicrmw has
+ completed before
+ invalidating the
+ cache.
+
+ 3. buffer_inv sc1=1
+
+ - Must happen before
+ any following
+ global/generic
+ load/load
+ atomic/atomicrmw.
+ - Ensures that
+ following loads
+ will not see stale
+ global data.
+
+ atomicrmw acquire - system - generic 1. flat_atomic sc1=1
+ 2. s_waitcnt vmcnt(0) &
+ lgkmcnt(0)
+
+ - If TgSplit execution mode,
+ omit lgkmcnt(0).
+ - If OpenCL, omit
+ lgkmcnt(0).
+ - Must happen before
+ following
+ buffer_inv.
+ - Ensures the
+ atomicrmw has
+ completed before
+ invalidating the
+ caches.
+
+ 3. buffer_inv sc0=1 sc1=1
+
+ - Must happen before
+ any following
+ global/generic
+ load/load
+ atomic/atomicrmw.
+ - Ensures that
+ following
+ loads will not see
+ stale MTYPE NC global data.
+ MTYPE RW and CC memory will
+ never be stale due to the
+ memory probes.
+
+ fence acquire - singlethread *none* *none*
+ - wavefront
+ fence acquire - workgroup *none* 1. s_waitcnt lgkm/vmcnt(0)
+
+ - Use lgkmcnt(0) if not
+ TgSplit execution mode
+ and vmcnt(0) if TgSplit
+ execution mode.
+ - If OpenCL and
+ address space is
+ not generic, omit
+ lgkmcnt(0).
+ - If OpenCL and
+ address space is
+ local, omit
+ vmcnt(0).
+ - However, since LLVM
+ currently has no
+ address space on
+ the fence need to
+ conservatively
+ always generate. If
+ fence had an
+ address space then
+ set to address
+ space of OpenCL
+ fence flag, or to
+ generic if both
+ local and global
+ flags are
+ specified.
+ - s_waitcnt vmcnt(0)
+ must happen after
+ any preceding
+ global/generic load
+ atomic/
+ atomicrmw
+ with an equal or
+ wider sync scope
+ and memory ordering
+ stronger than
+ unordered (this is
+ termed the
+ fence-paired-atomic).
+ - s_waitcnt lgkmcnt(0)
+ must happen after
+ any preceding
+ local/generic load
+ atomic/atomicrmw
+ with an equal or
+ wider sync scope
+ and memory ordering
+ stronger than
+ unordered (this is
+ termed the
+ fence-paired-atomic).
+ - Must happen before
+ the following
+ buffer_inv and
+ any following
+ global/generic
+ load/load
+ atomic/store/store
+ atomic/atomicrmw.
+ - Ensures any
+ following global
+ data read is no
+ older than the
+ value read by the
+ fence-paired-atomic.
+
+ 3. buffer_inv sc0=1
+
+ - If not TgSplit execution
+ mode, omit.
+ - Ensures that
+ following
+ loads will not see
+ stale data.
+
+ fence acquire - agent *none* 1. s_waitcnt lgkmcnt(0) &
+ vmcnt(0)
+
+ - If TgSplit execution mode,
+ omit lgkmcnt(0).
+ - If OpenCL and
+ address space is
+ not generic, omit
+ lgkmcnt(0).
+ - However, since LLVM
+ currently has no
+ address space on
+ the fence need to
+ conservatively
+ always generate
+ (see comment for
+ previous fence).
+ - Could be split into
+ separate s_waitcnt
+ vmcnt(0) and
+ s_waitcnt
+ lgkmcnt(0) to allow
+ them to be
+ independently moved
+ according to the
+ following rules.
+ - s_waitcnt vmcnt(0)
+ must happen after
+ any preceding
+ global/generic load
+ atomic/atomicrmw
+ with an equal or
+ wider sync scope
+ and memory ordering
+ stronger than
+ unordered (this is
+ termed the
+ fence-paired-atomic).
+ - s_waitcnt lgkmcnt(0)
+ must happen after
+ any preceding
+ local/generic load
+ atomic/atomicrmw
+ with an equal or
+ wider sync scope
+ and memory ordering
+ stronger than
+ unordered (this is
+ termed the
+ fence-paired-atomic).
+ - Must happen before
+ the following
+ buffer_inv.
+ - Ensures that the
+ fence-paired atomic
+ has completed
+ before invalidating
+ the
+ cache. Therefore
+ any following
+ locations read must
+ be no older than
+ the value read by
+ the
+ fence-paired-atomic.
+
+ 2. buffer_inv sc1=1
+
+ - Must happen before any
+ following global/generic
+ load/load
+ atomic/store/store
+ atomic/atomicrmw.
+ - Ensures that
+ following loads
+ will not see stale
+ global data.
+
+ fence acquire - system *none* 1. s_waitcnt lgkmcnt(0) &
+ vmcnt(0)
+
+ - If TgSplit execution mode,
+ omit lgkmcnt(0).
+ - If OpenCL and
+ address space is
+ not generic, omit
+ lgkmcnt(0).
+ - However, since LLVM
+ currently has no
+ address space on
+ the fence need to
+ conservatively
+ always generate
+ (see comment for
+ previous fence).
+ - Could be split into
+ separate s_waitcnt
+ vmcnt(0) and
+ s_waitcnt
+ lgkmcnt(0) to allow
+ them to be
+ independently moved
+ according to the
+ following rules.
+ - s_waitcnt vmcnt(0)
+ must happen after
+ any preceding
+ global/generic load
+ atomic/atomicrmw
+ with an equal or
+ wider sync scope
+ and memory ordering
+ stronger than
+ unordered (this is
+ termed the
+ fence-paired-atomic).
+ - s_waitcnt lgkmcnt(0)
+ must happen after
+ any preceding
+ local/generic load
+ atomic/atomicrmw
+ with an equal or
+ wider sync scope
+ and memory ordering
+ stronger than
+ unordered (this is
+ termed the
+ fence-paired-atomic).
+ - Must happen before
+ the following
+ buffer_inv.
+ - Ensures that the
+ fence-paired atomic
+ has completed
+ before invalidating
+ the
+ cache. Therefore
+ any following
+ locations read must
+ be no older than
+ the value read by
+ the
+ fence-paired-atomic.
+
+ 2. buffer_inv sc0=1 sc1=1
+
+ - Must happen before any
+ following global/generic
+ load/load
+ atomic/store/store
+ atomic/atomicrmw.
+ - Ensures that
+ following loads
+ will not see stale
+ global data.
+
+ **Release Atomic**
+ ------------------------------------------------------------------------------------
+ store atomic release - singlethread - global 1. buffer/global/flat_store
+ - wavefront - generic
+ store atomic release - singlethread - local *If TgSplit execution mode,
+ - wavefront local address space cannot
+ be used.*
+
+ 1. ds_store
+ store atomic release - workgroup - global 1. s_waitcnt lgkm/vmcnt(0)
+ - generic
+ - Use lgkmcnt(0) if not
+ TgSplit execution mode
+ and vmcnt(0) if TgSplit
+ execution mode.
+ - If OpenCL, omit lgkmcnt(0).
+ - s_waitcnt vmcnt(0)
+ must happen after
+ any preceding
+ global/generic load/store/
+ load atomic/store atomic/
+ atomicrmw.
+ - s_waitcnt lgkmcnt(0)
+ must happen after
+ any preceding
+ local/generic
+ load/store/load
+ atomic/store
+ atomic/atomicrmw.
+ - Must happen before
+ the following
+ store.
+ - Ensures that all
+ memory operations
+ have
+ completed before
+ performing the
+ store that is being
+ released.
+
+ 2. buffer/global/flat_store sc0=1
+ store atomic release - workgroup - local *If TgSplit execution mode,
+ local address space cannot
+ be used.*
+
+ 1. ds_store
+ store atomic release - agent - global 1. buffer_wbl2 sc1=1
+ - generic
+ - Must happen before
+ following s_waitcnt.
+ - Performs L2 writeback to
+ ensure previous
+ global/generic
+ store/atomicrmw are
+ visible at agent scope.
+
+ 2. s_waitcnt lgkmcnt(0) &
+ vmcnt(0)
+
+ - If TgSplit execution mode,
+ omit lgkmcnt(0).
+ - If OpenCL and
+ address space is
+ not generic, omit
+ lgkmcnt(0).
+ - Could be split into
+ separate s_waitcnt
+ vmcnt(0) and
+ s_waitcnt
+ lgkmcnt(0) to allow
+ them to be
+ independently moved
+ according to the
+ following rules.
+ - s_waitcnt vmcnt(0)
+ must happen after
+ any preceding
+ global/generic
+ load/store/load
+ atomic/store
+ atomic/atomicrmw.
+ - s_waitcnt lgkmcnt(0)
+ must happen after
+ any preceding
+ local/generic
+ load/store/load
+ atomic/store
+ atomic/atomicrmw.
+ - Must happen before
+ the following
+ store.
+ - Ensures that all
+ memory operations
+ to memory have
+ completed before
+ performing the
+ store that is being
+ released.
+
+ 3. buffer/global/flat_store sc1=1
+ store atomic release - system - global 1. buffer_wbl2 sc0=1 sc1=1
+ - generic
+ - Must happen before
+ following s_waitcnt.
+ - Performs L2 writeback to
+ ensure previous
+ global/generic
+ store/atomicrmw are
+ visible at system scope.
+
+ 2. s_waitcnt lgkmcnt(0) &
+ vmcnt(0)
+
+ - If TgSplit execution mode,
+ omit lgkmcnt(0).
+ - If OpenCL and
+ address space is
+ not generic, omit
+ lgkmcnt(0).
+ - Could be split into
+ separate s_waitcnt
+ vmcnt(0) and
+ s_waitcnt
+ lgkmcnt(0) to allow
+ them to be
+ independently moved
+ according to the
+ following rules.
+ - s_waitcnt vmcnt(0)
+ must happen after any
+ preceding
+ global/generic
+ load/store/load
+ atomic/store
+ atomic/atomicrmw.
+ - s_waitcnt lgkmcnt(0)
+ must happen after any
+ preceding
+ local/generic
+ load/store/load
+ atomic/store
+ atomic/atomicrmw.
+ - Must happen before
+ the following
+ store.
+ - Ensures that all
+ memory operations
+ to memory and the L2
+ writeback have
+ completed before
+ performing the
+ store that is being
+ released.
+
+ 2. buffer/global/flat_store
+ sc0=1 sc1=1
+ atomicrmw release - singlethread - global 1. buffer/global/flat_atomic
+ - wavefront - generic
+ atomicrmw release - singlethread - local *If TgSplit execution mode,
+ - wavefront local address space cannot
+ be used.*
+
+ 1. ds_atomic
+ atomicrmw release - workgroup - global 1. s_waitcnt lgkm/vmcnt(0)
+ - generic
+ - Use lgkmcnt(0) if not
+ TgSplit execution mode
+ and vmcnt(0) if TgSplit
+ execution mode.
+ - If OpenCL, omit
+ lgkmcnt(0).
+ - s_waitcnt vmcnt(0)
+ must happen after
+ any preceding
+ global/generic load/store/
+ load atomic/store atomic/
+ atomicrmw.
+ - s_waitcnt lgkmcnt(0)
+ must happen after
+ any preceding
+ local/generic
+ load/store/load
+ atomic/store
+ atomic/atomicrmw.
+ - Must happen before
+ the following
+ atomicrmw.
+ - Ensures that all
+ memory operations
+ have
+ completed before
+ performing the
+ atomicrmw that is
+ being released.
+
+ 2. buffer/global/flat_atomic sc0=1
+ atomicrmw release - workgroup - local *If TgSplit execution mode,
+ local address space cannot
+ be used.*
+
+ 1. ds_atomic
+ atomicrmw release - agent - global 1. buffer_wbl2 sc1=1
+ - generic
+ - Must happen before
+ following s_waitcnt.
+ - Performs L2 writeback to
+ ensure previous
+ global/generic
+ store/atomicrmw are
+ visible at agent scope.
+
+ 2. s_waitcnt lgkmcnt(0) &
+ vmcnt(0)
+
+ - If TgSplit execution mode,
+ omit lgkmcnt(0).
+ - If OpenCL, omit
+ lgkmcnt(0).
+ - Could be split into
+ separate s_waitcnt
+ vmcnt(0) and
+ s_waitcnt
+ lgkmcnt(0) to allow
+ them to be
+ independently moved
+ according to the
+ following rules.
+ - s_waitcnt vmcnt(0)
+ must happen after
+ any preceding
+ global/generic
+ load/store/load
+ atomic/store
+ atomic/atomicrmw.
+ - s_waitcnt lgkmcnt(0)
+ must happen after
+ any preceding
+ local/generic
+ load/store/load
+ atomic/store
+ atomic/atomicrmw.
+ - Must happen before
+ the following
+ atomicrmw.
+ - Ensures that all
+ memory operations
+ to global and local
+ have completed
+ before performing
+ the atomicrmw that
+ is being released.
+
+ 3. buffer/global/flat_atomic sc1=1
+ atomicrmw release - system - global 1. buffer_wbl2 sc0=1 sc1=1
+ - generic
+ - Must happen before
+ following s_waitcnt.
+ - Performs L2 writeback to
+ ensure previous
+ global/generic
+ store/atomicrmw are
+ visible at system scope.
+
+ 2. s_waitcnt lgkmcnt(0) &
+ vmcnt(0)
+
+ - If TgSplit execution mode,
+ omit lgkmcnt(0).
+ - If OpenCL, omit
+ lgkmcnt(0).
+ - Could be split into
+ separate s_waitcnt
+ vmcnt(0) and
+ s_waitcnt
+ lgkmcnt(0) to allow
+ them to be
+ independently moved
+ according to the
+ following rules.
+ - s_waitcnt vmcnt(0)
+ must happen after
+ any preceding
+ global/generic
+ load/store/load
+ atomic/store
+ atomic/atomicrmw.
+ - s_waitcnt lgkmcnt(0)
+ must happen after
+ any preceding
+ local/generic
+ load/store/load
+ atomic/store
+ atomic/atomicrmw.
+ - Must happen before
+ the following
+ atomicrmw.
+ - Ensures that all
+ memory operations
+ to memory and the L2
+ writeback have
+ completed before
+ performing the
+ store that is being
+ released.
+
+ 3. buffer/global/flat_atomic
+ sc0=1 sc1=1
+ fence release - singlethread *none* *none*
+ - wavefront
+ fence release - workgroup *none* 1. s_waitcnt lgkm/vmcnt(0)
+
+ - Use lgkmcnt(0) if not
+ TgSplit execution mode
+ and vmcnt(0) if TgSplit
+ execution mode.
+ - If OpenCL and
+ address space is
+ not generic, omit
+ lgkmcnt(0).
+ - If OpenCL and
+ address space is
+ local, omit
+ vmcnt(0).
+ - However, since LLVM
+ currently has no
+ address space on
+ the fence need to
+ conservatively
+ always generate. If
+ fence had an
+ address space then
+ set to address
+ space of OpenCL
+ fence flag, or to
+ generic if both
+ local and global
+ flags are
+ specified.
+ - s_waitcnt vmcnt(0)
+ must happen after
+ any preceding
+ global/generic
+ load/store/
+ load atomic/store atomic/
+ atomicrmw.
+ - s_waitcnt lgkmcnt(0)
+ must happen after
+ any preceding
+ local/generic
+ load/load
+ atomic/store/store
+ atomic/atomicrmw.
+ - Must happen before
+ any following store
+ atomic/atomicrmw
+ with an equal or
+ wider sync scope
+ and memory ordering
+ stronger than
+ unordered (this is
+ termed the
+ fence-paired-atomic).
+ - Ensures that all
+ memory operations
+ have
+ completed before
+ performing the
+ following
+ fence-paired-atomic.
+
+ fence release - agent *none* 1. buffer_wbl2 sc1=1
+
+ - If OpenCL and
+ address space is
+ local, omit.
+ - Must happen before
+ following s_waitcnt.
+ - Performs L2 writeback to
+ ensure previous
+ global/generic
+ store/atomicrmw are
+ visible at agent scope.
+
+ 2. s_waitcnt lgkmcnt(0) &
+ vmcnt(0)
+
+ - If TgSplit execution mode,
+ omit lgkmcnt(0).
+ - If OpenCL and
+ address space is
+ not generic, omit
+ lgkmcnt(0).
+ - If OpenCL and
+ address space is
+ local, omit
+ vmcnt(0).
+ - However, since LLVM
+ currently has no
+ address space on
+ the fence need to
+ conservatively
+ always generate. If
+ fence had an
+ address space then
+ set to address
+ space of OpenCL
+ fence flag, or to
+ generic if both
+ local and global
+ flags are
+ specified.
+ - Could be split into
+ separate s_waitcnt
+ vmcnt(0) and
+ s_waitcnt
+ lgkmcnt(0) to allow
+ them to be
+ independently moved
+ according to the
+ following rules.
+ - s_waitcnt vmcnt(0)
+ must happen after
+ any preceding
+ global/generic
+ load/store/load
+ atomic/store
+ atomic/atomicrmw.
+ - s_waitcnt lgkmcnt(0)
+ must happen after
+ any preceding
+ local/generic
+ load/store/load
+ atomic/store
+ atomic/atomicrmw.
+ - Must happen before
+ any following store
+ atomic/atomicrmw
+ with an equal or
+ wider sync scope
+ and memory ordering
+ stronger than
+ unordered (this is
+ termed the
+ fence-paired-atomic).
+ - Ensures that all
+ memory operations
+ have
+ completed before
+ performing the
+ following
+ fence-paired-atomic.
+
+ fence release - system *none* 1. buffer_wbl2 sc0=1 sc1=1
+
+ - Must happen before
+ following s_waitcnt.
+ - Performs L2 writeback to
+ ensure previous
+ global/generic
+ store/atomicrmw are
+ visible at system scope.
+
+ 2. s_waitcnt lgkmcnt(0) &
+ vmcnt(0)
+
+ - If TgSplit execution mode,
+ omit lgkmcnt(0).
+ - If OpenCL and
+ address space is
+ not generic, omit
+ lgkmcnt(0).
+ - If OpenCL and
+ address space is
+ local, omit
+ vmcnt(0).
+ - However, since LLVM
+ currently has no
+ address space on
+ the fence need to
+ conservatively
+ always generate. If
+ fence had an
+ address space then
+ set to address
+ space of OpenCL
+ fence flag, or to
+ generic if both
+ local and global
+ flags are
+ specified.
+ - Could be split into
+ separate s_waitcnt
+ vmcnt(0) and
+ s_waitcnt
+ lgkmcnt(0) to allow
+ them to be
+ independently moved
+ according to the
+ following rules.
+ - s_waitcnt vmcnt(0)
+ must happen after
+ any preceding
+ global/generic
+ load/store/load
+ atomic/store
+ atomic/atomicrmw.
+ - s_waitcnt lgkmcnt(0)
+ must happen after
+ any preceding
+ local/generic
+ load/store/load
+ atomic/store
+ atomic/atomicrmw.
+ - Must happen before
+ any following store
+ atomic/atomicrmw
+ with an equal or
+ wider sync scope
+ and memory ordering
+ stronger than
+ unordered (this is
+ termed the
+ fence-paired-atomic).
+ - Ensures that all
+ memory operations
+ have
+ completed before
+ performing the
+ following
+ fence-paired-atomic.
+
+ **Acquire-Release Atomic**
+ ------------------------------------------------------------------------------------
+ atomicrmw acq_rel - singlethread - global 1. buffer/global/flat_atomic
+ - wavefront - generic
+ atomicrmw acq_rel - singlethread - local *If TgSplit execution mode,
+ - wavefront local address space cannot
+ be used.*
+
+ 1. ds_atomic
+ atomicrmw acq_rel - workgroup - global 1. s_waitcnt lgkm/vmcnt(0)
+
+ - Use lgkmcnt(0) if not
+ TgSplit execution mode
+ and vmcnt(0) if TgSplit
+ execution mode.
+ - If OpenCL, omit
+ lgkmcnt(0).
+ - Must happen after
+ any preceding
+ local/generic
+ load/store/load
+ atomic/store
+ atomic/atomicrmw.
+ - s_waitcnt vmcnt(0)
+ must happen after
+ any preceding
+ global/generic load/store/
+ load atomic/store atomic/
+ atomicrmw.
+ - s_waitcnt lgkmcnt(0)
+ must happen after
+ any preceding
+ local/generic
+ load/store/load
+ atomic/store
+ atomic/atomicrmw.
+ - Must happen before
+ the following
+ atomicrmw.
+ - Ensures that all
+ memory operations
+ have
+ completed before
+ performing the
+ atomicrmw that is
+ being released.
+
+ 2. buffer/global_atomic
+ 3. s_waitcnt vmcnt(0)
+
+ - If not TgSplit execution
+ mode, omit.
+ - Must happen before
+ the following
+ buffer_inv.
+ - Ensures any
+ following global
+ data read is no
+ older than the
+ atomicrmw value
+ being acquired.
+
+ 4. buffer_inv sc0=1
+
+ - If not TgSplit execution
+ mode, omit.
+ - Ensures that
+ following
+ loads will not see
+ stale data.
+
+ atomicrmw acq_rel - workgroup - local *If TgSplit execution mode,
+ local address space cannot
+ be used.*
+
+ 1. ds_atomic
+ 2. s_waitcnt lgkmcnt(0)
+
+ - If OpenCL, omit.
+ - Must happen before
+ any following
+ global/generic
+ load/load
+ atomic/store/store
+ atomic/atomicrmw.
+ - Ensures any
+ following global
+ data read is no
+ older than the local load
+ atomic value being
+ acquired.
+
+ atomicrmw acq_rel - workgroup - generic 1. s_waitcnt lgkm/vmcnt(0)
+
+ - Use lgkmcnt(0) if not
+ TgSplit execution mode
+ and vmcnt(0) if TgSplit
+ execution mode.
+ - If OpenCL, omit
+ lgkmcnt(0).
+ - s_waitcnt vmcnt(0)
+ must happen after
+ any preceding
+ global/generic load/store/
+ load atomic/store atomic/
+ atomicrmw.
+ - s_waitcnt lgkmcnt(0)
+ must happen after
+ any preceding
+ local/generic
+ load/store/load
+ atomic/store
+ atomic/atomicrmw.
+ - Must happen before
+ the following
+ atomicrmw.
+ - Ensures that all
+ memory operations
+ have
+ completed before
+ performing the
+ atomicrmw that is
+ being released.
+
+ 2. flat_atomic
+ 3. s_waitcnt lgkmcnt(0) &
+ vmcnt(0)
+
+ - If not TgSplit execution
+ mode, omit vmcnt(0).
+ - If OpenCL, omit
+ lgkmcnt(0).
+ - Must happen before
+ the following
+ buffer_inv and
+ any following
+ global/generic
+ load/load
+ atomic/store/store
+ atomic/atomicrmw.
+ - Ensures any
+ following global
+ data read is no
+ older than a local load
+ atomic value being
+ acquired.
+
+ 3. buffer_inv sc0=1
+
+ - If not TgSplit execution
+ mode, omit.
+ - Ensures that
+ following
+ loads will not see
+ stale data.
+
+ atomicrmw acq_rel - agent - global 1. buffer_wbl2 sc1=1
+
+ - Must happen before
+ following s_waitcnt.
+ - Performs L2 writeback to
+ ensure previous
+ global/generic
+ store/atomicrmw are
+ visible at agent scope.
+
+ 2. s_waitcnt lgkmcnt(0) &
+ vmcnt(0)
+
+ - If TgSplit execution mode,
+ omit lgkmcnt(0).
+ - If OpenCL, omit
+ lgkmcnt(0).
+ - Could be split into
+ separate s_waitcnt
+ vmcnt(0) and
+ s_waitcnt
+ lgkmcnt(0) to allow
+ them to be
+ independently moved
+ according to the
+ following rules.
+ - s_waitcnt vmcnt(0)
+ must happen after
+ any preceding
+ global/generic
+ load/store/load
+ atomic/store
+ atomic/atomicrmw.
+ - s_waitcnt lgkmcnt(0)
+ must happen after
+ any preceding
+ local/generic
+ load/store/load
+ atomic/store
+ atomic/atomicrmw.
+ - Must happen before
+ the following
+ atomicrmw.
+ - Ensures that all
+ memory operations
+ to global have
+ completed before
+ performing the
+ atomicrmw that is
+ being released.
+
+ 3. buffer/global_atomic
+ 4. s_waitcnt vmcnt(0)
+
+ - Must happen before
+ following
+ buffer_inv.
+ - Ensures the
+ atomicrmw has
+ completed before
+ invalidating the
+ cache.
+
+ 5. buffer_inv sc1=1
+
+ - Must happen before
+ any following
+ global/generic
+ load/load
+ atomic/atomicrmw.
+ - Ensures that
+ following loads
+ will not see stale
+ global data.
+
+ atomicrmw acq_rel - system - global 1. buffer_wbl2 sc0=1 sc1=1
+
+ - Must happen before
+ following s_waitcnt.
+ - Performs L2 writeback to
+ ensure previous
+ global/generic
+ store/atomicrmw are
+ visible at system scope.
+
+ 2. s_waitcnt lgkmcnt(0) &
+ vmcnt(0)
+
+ - If TgSplit execution mode,
+ omit lgkmcnt(0).
+ - If OpenCL, omit
+ lgkmcnt(0).
+ - Could be split into
+ separate s_waitcnt
+ vmcnt(0) and
+ s_waitcnt
+ lgkmcnt(0) to allow
+ them to be
+ independently moved
+ according to the
+ following rules.
+ - s_waitcnt vmcnt(0)
+ must happen after
+ any preceding
+ global/generic
+ load/store/load
+ atomic/store
+ atomic/atomicrmw.
+ - s_waitcnt lgkmcnt(0)
+ must happen after
+ any preceding
+ local/generic
+ load/store/load
+ atomic/store
+ atomic/atomicrmw.
+ - Must happen before
+ the following
+ atomicrmw.
+ - Ensures that all
+ memory operations
+ to global and L2 writeback
+ have completed before
+ performing the
+ atomicrmw that is
+ being released.
+
+ 3. buffer/global_atomic
+ sc1=1
+ 4. s_waitcnt vmcnt(0)
+
+ - Must happen before
+ following
+ buffer_inv.
+ - Ensures the
+ atomicrmw has
+ completed before
+ invalidating the
+ caches.
+
+ 5. buffer_inv sc0=1 sc1=1
+
+ - Must happen before
+ any following
+ global/generic
+ load/load
+ atomic/atomicrmw.
+ - Ensures that
+ following loads
+ will not see stale
+ MTYPE NC global data.
+ MTYPE RW and CC memory will
+ never be stale due to the
+ memory probes.
+
+ atomicrmw acq_rel - agent - generic 1. buffer_wbl2 sc1=1
+
+ - Must happen before
+ following s_waitcnt.
+ - Performs L2 writeback to
+ ensure previous
+ global/generic
+ store/atomicrmw are
+ visible at agent scope.
+
+ 2. s_waitcnt lgkmcnt(0) &
+ vmcnt(0)
+
+ - If TgSplit execution mode,
+ omit lgkmcnt(0).
+ - If OpenCL, omit
+ lgkmcnt(0).
+ - Could be split into
+ separate s_waitcnt
+ vmcnt(0) and
+ s_waitcnt
+ lgkmcnt(0) to allow
+ them to be
+ independently moved
+ according to the
+ following rules.
+ - s_waitcnt vmcnt(0)
+ must happen after
+ any preceding
+ global/generic
+ load/store/load
+ atomic/store
+ atomic/atomicrmw.
+ - s_waitcnt lgkmcnt(0)
+ must happen after
+ any preceding
+ local/generic
+ load/store/load
+ atomic/store
+ atomic/atomicrmw.
+ - Must happen before
+ the following
+ atomicrmw.
+ - Ensures that all
+ memory operations
+ to global have
+ completed before
+ performing the
+ atomicrmw that is
+ being released.
+
+ 3. flat_atomic
+ 4. s_waitcnt vmcnt(0) &
+ lgkmcnt(0)
+
+ - If TgSplit execution mode,
+ omit lgkmcnt(0).
+ - If OpenCL, omit
+ lgkmcnt(0).
+ - Must happen before
+ following
+ buffer_inv.
+ - Ensures the
+ atomicrmw has
+ completed before
+ invalidating the
+ cache.
+
+ 5. buffer_inv sc1=1
+
+ - Must happen before
+ any following
+ global/generic
+ load/load
+ atomic/atomicrmw.
+ - Ensures that
+ following loads
+ will not see stale
+ global data.
+
+ atomicrmw acq_rel - system - generic 1. buffer_wbl2 sc0=1 sc1=1
+
+ - Must happen before
+ following s_waitcnt.
+ - Performs L2 writeback to
+ ensure previous
+ global/generic
+ store/atomicrmw are
+ visible at system scope.
+
+ 2. s_waitcnt lgkmcnt(0) &
+ vmcnt(0)
+
+ - If TgSplit execution mode,
+ omit lgkmcnt(0).
+ - If OpenCL, omit
+ lgkmcnt(0).
+ - Could be split into
+ separate s_waitcnt
+ vmcnt(0) and
+ s_waitcnt
+ lgkmcnt(0) to allow
+ them to be
+ independently moved
+ according to the
+ following rules.
+ - s_waitcnt vmcnt(0)
+ must happen after
+ any preceding
+ global/generic
+ load/store/load
+ atomic/store
+ atomic/atomicrmw.
+ - s_waitcnt lgkmcnt(0)
+ must happen after
+ any preceding
+ local/generic
+ load/store/load
+ atomic/store
+ atomic/atomicrmw.
+ - Must happen before
+ the following
+ atomicrmw.
+ - Ensures that all
+ memory operations
+ to global and L2 writeback
+ have completed before
+ performing the
+ atomicrmw that is
+ being released.
+
+ 3. flat_atomic sc1=1
+ 4. s_waitcnt vmcnt(0) &
+ lgkmcnt(0)
+
+ - If TgSplit execution mode,
+ omit lgkmcnt(0).
+ - If OpenCL, omit
+ lgkmcnt(0).
+ - Must happen before
+ following
+ buffer_inv.
+ - Ensures the
+ atomicrmw has
+ completed before
+ invalidating the
+ caches.
+
+ 5. buffer_inv sc0=1 sc1=1
+
+ - Must happen before
+ any following
+ global/generic
+ load/load
+ atomic/atomicrmw.
+ - Ensures that
+ following loads
+ will not see stale
+ MTYPE NC global data.
+ MTYPE RW and CC memory will
+ never be stale due to the
+ memory probes.
+
+ fence acq_rel - singlethread *none* *none*
+ - wavefront
+ fence acq_rel - workgroup *none* 1. s_waitcnt lgkm/vmcnt(0)
+
+ - Use lgkmcnt(0) if not
+ TgSplit execution mode
+ and vmcnt(0) if TgSplit
+ execution mode.
+ - If OpenCL and
+ address space is
+ not generic, omit
+ lgkmcnt(0).
+ - If OpenCL and
+ address space is
+ local, omit
+ vmcnt(0).
+ - However,
+ since LLVM
+ currently has no
+ address space on
+ the fence need to
+ conservatively
+ always generate
+ (see comment for
+ previous fence).
+ - s_waitcnt vmcnt(0)
+ must happen after
+ any preceding
+ global/generic
+ load/store/
+ load atomic/store atomic/
+ atomicrmw.
+ - s_waitcnt lgkmcnt(0)
+ must happen after
+ any preceding
+ local/generic
+ load/load
+ atomic/store/store
+ atomic/atomicrmw.
+ - Must happen before
+ any following
+ global/generic
+ load/load
+ atomic/store/store
+ atomic/atomicrmw.
+ - Ensures that all
+ memory operations
+ have
+ completed before
+ performing any
+ following global
+ memory operations.
+ - Ensures that the
+ preceding
+ local/generic load
+ atomic/atomicrmw
+ with an equal or
+ wider sync scope
+ and memory ordering
+ stronger than
+ unordered (this is
+ termed the
+ acquire-fence-paired-atomic)
+ has completed
+ before following
+ global memory
+ operations. This
+ satisfies the
+ requirements of
+ acquire.
+ - Ensures that all
+ previous memory
+ operations have
+ completed before a
+ following
+ local/generic store
+ atomic/atomicrmw
+ with an equal or
+ wider sync scope
+ and memory ordering
+ stronger than
+ unordered (this is
+ termed the
+ release-fence-paired-atomic).
+ This satisfies the
+ requirements of
+ release.
+ - Must happen before
+ the following
+ buffer_inv.
+ - Ensures that the
+ acquire-fence-paired
+ atomic has completed
+ before invalidating
+ the
+ cache. Therefore
+ any following
+ locations read must
+ be no older than
+ the value read by
+ the
+ acquire-fence-paired-atomic.
+
+ 3. buffer_inv sc0=1
+
+ - If not TgSplit execution
+ mode, omit.
+ - Ensures that
+ following
+ loads will not see
+ stale data.
+
+ fence acq_rel - agent *none* 1. buffer_wbl2 sc1=1
+
+ - If OpenCL and
+ address space is
+ local, omit.
+ - Must happen before
+ following s_waitcnt.
+ - Performs L2 writeback to
+ ensure previous
+ global/generic
+ store/atomicrmw are
+ visible at agent scope.
+
+ 2. s_waitcnt lgkmcnt(0) &
+ vmcnt(0)
+
+ - If TgSplit execution mode,
+ omit lgkmcnt(0).
+ - If OpenCL and
+ address space is
+ not generic, omit
+ lgkmcnt(0).
+ - However, since LLVM
+ currently has no
+ address space on
+ the fence need to
+ conservatively
+ always generate
+ (see comment for
+ previous fence).
+ - Could be split into
+ separate s_waitcnt
+ vmcnt(0) and
+ s_waitcnt
+ lgkmcnt(0) to allow
+ them to be
+ independently moved
+ according to the
+ following rules.
+ - s_waitcnt vmcnt(0)
+ must happen after
+ any preceding
+ global/generic
+ load/store/load
+ atomic/store
+ atomic/atomicrmw.
+ - s_waitcnt lgkmcnt(0)
+ must happen after
+ any preceding
+ local/generic
+ load/store/load
+ atomic/store
+ atomic/atomicrmw.
+ - Must happen before
+ the following
+ buffer_inv.
+ - Ensures that the
+ preceding
+ global/local/generic
+ load
+ atomic/atomicrmw
+ with an equal or
+ wider sync scope
+ and memory ordering
+ stronger than
+ unordered (this is
+ termed the
+ acquire-fence-paired-atomic)
+ has completed
+ before invalidating
+ the cache. This
+ satisfies the
+ requirements of
+ acquire.
+ - Ensures that all
+ previous memory
+ operations have
+ completed before a
+ following
+ global/local/generic
+ store
+ atomic/atomicrmw
+ with an equal or
+ wider sync scope
+ and memory ordering
+ stronger than
+ unordered (this is
+ termed the
+ release-fence-paired-atomic).
+ This satisfies the
+ requirements of
+ release.
+
+ 3. buffer_inv sc1=1
+
+ - Must happen before
+ any following
+ global/generic
+ load/load
+ atomic/store/store
+ atomic/atomicrmw.
+ - Ensures that
+ following loads
+ will not see stale
+ global data. This
+ satisfies the
+ requirements of
+ acquire.
+
+ fence acq_rel - system *none* 1. buffer_wbl2 sc0=1 sc1=1
+
+ - If OpenCL and
+ address space is
+ local, omit.
+ - Must happen before
+ following s_waitcnt.
+ - Performs L2 writeback to
+ ensure previous
+ global/generic
+ store/atomicrmw are
+ visible at system scope.
+
+ 1. s_waitcnt lgkmcnt(0) &
+ vmcnt(0)
+
+ - If TgSplit execution mode,
+ omit lgkmcnt(0).
+ - If OpenCL and
+ address space is
+ not generic, omit
+ lgkmcnt(0).
+ - However, since LLVM
+ currently has no
+ address space on
+ the fence need to
+ conservatively
+ always generate
+ (see comment for
+ previous fence).
+ - Could be split into
+ separate s_waitcnt
+ vmcnt(0) and
+ s_waitcnt
+ lgkmcnt(0) to allow
+ them to be
+ independently moved
+ according to the
+ following rules.
+ - s_waitcnt vmcnt(0)
+ must happen after
+ any preceding
+ global/generic
+ load/store/load
+ atomic/store
+ atomic/atomicrmw.
+ - s_waitcnt lgkmcnt(0)
+ must happen after
+ any preceding
+ local/generic
+ load/store/load
+ atomic/store
+ atomic/atomicrmw.
+ - Must happen before
+ the following
+ buffer_inv.
+ - Ensures that the
+ preceding
+ global/local/generic
+ load
+ atomic/atomicrmw
+ with an equal or
+ wider sync scope
+ and memory ordering
+ stronger than
+ unordered (this is
+ termed the
+ acquire-fence-paired-atomic)
+ has completed
+ before invalidating
+ the cache. This
+ satisfies the
+ requirements of
+ acquire.
+ - Ensures that all
+ previous memory
+ operations have
+ completed before a
+ following
+ global/local/generic
+ store
+ atomic/atomicrmw
+ with an equal or
+ wider sync scope
+ and memory ordering
+ stronger than
+ unordered (this is
+ termed the
+ release-fence-paired-atomic).
+ This satisfies the
+ requirements of
+ release.
+
+ 2. buffer_inv sc0=1 sc1=1
+
+ - Must happen before
+ any following
+ global/generic
+ load/load
+ atomic/store/store
+ atomic/atomicrmw.
+ - Ensures that
+ following loads
+ will not see stale
+ MTYPE NC global data.
+ MTYPE RW and CC memory will
+ never be stale due to the
+ memory probes.
+
+ **Sequential Consistent Atomic**
+ ------------------------------------------------------------------------------------
+ load atomic seq_cst - singlethread - global *Same as corresponding
+ - wavefront - local load atomic acquire,
+ - generic except must generated
+ all instructions even
+ for OpenCL.*
+ load atomic seq_cst - workgroup - global 1. s_waitcnt lgkm/vmcnt(0)
+ - generic
+ - Use lgkmcnt(0) if not
+ TgSplit execution mode
+ and vmcnt(0) if TgSplit
+ execution mode.
+ - s_waitcnt lgkmcnt(0) must
+ happen after
+ preceding
+ local/generic load
+ atomic/store
+ atomic/atomicrmw
+ with memory
+ ordering of seq_cst
+ and with equal or
+ wider sync scope.
+ (Note that seq_cst
+ fences have their
+ own s_waitcnt
+ lgkmcnt(0) and so do
+ not need to be
+ considered.)
+ - s_waitcnt vmcnt(0)
+ must happen after
+ preceding
+ global/generic load
+ atomic/store
+ atomic/atomicrmw
+ with memory
+ ordering of seq_cst
+ and with equal or
+ wider sync scope.
+ (Note that seq_cst
+ fences have their
+ own s_waitcnt
+ vmcnt(0) and so do
+ not need to be
+ considered.)
+ - Ensures any
+ preceding
+ sequential
+ consistent global/local
+ memory instructions
+ have completed
+ before executing
+ this sequentially
+ consistent
+ instruction. This
+ prevents reordering
+ a seq_cst store
+ followed by a
+ seq_cst load. (Note
+ that seq_cst is
+ stronger than
+ acquire/release as
+ the reordering of
+ load acquire
+ followed by a store
+ release is
+ prevented by the
+ s_waitcnt of
+ the release, but
+ there is nothing
+ preventing a store
+ release followed by
+ load acquire from
+ completing out of
+ order. The s_waitcnt
+ could be placed after
+ seq_store or before
+ the seq_load. We
+ choose the load to
+ make the s_waitcnt be
+ as late as possible
+ so that the store
+ may have already
+ completed.)
+
+ 2. *Following
+ instructions same as
+ corresponding load
+ atomic acquire,
+ except must generated
+ all instructions even
+ for OpenCL.*
+ load atomic seq_cst - workgroup - local *If TgSplit execution mode,
+ local address space cannot
+ be used.*
+
+ *Same as corresponding
+ load atomic acquire,
+ except must generated
+ all instructions even
+ for OpenCL.*
+
+ load atomic seq_cst - agent - global 1. s_waitcnt lgkmcnt(0) &
+ - system - generic vmcnt(0)
+
+ - If TgSplit execution mode,
+ omit lgkmcnt(0).
+ - Could be split into
+ separate s_waitcnt
+ vmcnt(0)
+ and s_waitcnt
+ lgkmcnt(0) to allow
+ them to be
+ independently moved
+ according to the
+ following rules.
+ - s_waitcnt lgkmcnt(0)
+ must happen after
+ preceding
+ global/generic load
+ atomic/store
+ atomic/atomicrmw
+ with memory
+ ordering of seq_cst
+ and with equal or
+ wider sync scope.
+ (Note that seq_cst
+ fences have their
+ own s_waitcnt
+ lgkmcnt(0) and so do
+ not need to be
+ considered.)
+ - s_waitcnt vmcnt(0)
+ must happen after
+ preceding
+ global/generic load
+ atomic/store
+ atomic/atomicrmw
+ with memory
+ ordering of seq_cst
+ and with equal or
+ wider sync scope.
+ (Note that seq_cst
+ fences have their
+ own s_waitcnt
+ vmcnt(0) and so do
+ not need to be
+ considered.)
+ - Ensures any
+ preceding
+ sequential
+ consistent global
+ memory instructions
+ have completed
+ before executing
+ this sequentially
+ consistent
+ instruction. This
+ prevents reordering
+ a seq_cst store
+ followed by a
+ seq_cst load. (Note
+ that seq_cst is
+ stronger than
+ acquire/release as
+ the reordering of
+ load acquire
+ followed by a store
+ release is
+ prevented by the
+ s_waitcnt of
+ the release, but
+ there is nothing
+ preventing a store
+ release followed by
+ load acquire from
+ completing out of
+ order. The s_waitcnt
+ could be placed after
+ seq_store or before
+ the seq_load. We
+ choose the load to
+ make the s_waitcnt be
+ as late as possible
+ so that the store
+ may have already
+ completed.)
+
+ 2. *Following
+ instructions same as
+ corresponding load
+ atomic acquire,
+ except must generated
+ all instructions even
+ for OpenCL.*
+ store atomic seq_cst - singlethread - global *Same as corresponding
+ - wavefront - local store atomic release,
+ - workgroup - generic except must generated
+ - agent all instructions even
+ - system for OpenCL.*
+ atomicrmw seq_cst - singlethread - global *Same as corresponding
+ - wavefront - local atomicrmw acq_rel,
+ - workgroup - generic except must generated
+ - agent all instructions even
+ - system for OpenCL.*
+ fence seq_cst - singlethread *none* *Same as corresponding
+ - wavefront fence acq_rel,
+ - workgroup except must generated
+ - agent all instructions even
+ - system for OpenCL.*
+ ============ ============ ============== ========== ================================
+
.. _amdgpu-amdhsa-memory-model-gfx10:
Memory Model GFX10
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 8c8609c3c7100..728021a94c6ae 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -459,6 +459,56 @@ class SIGfx90ACacheControl : public SIGfx7CacheControl {
Position Pos) const override;
};
+class SIGfx940CacheControl : public SIGfx90ACacheControl {
+protected:
+
+ /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
+ /// is modified, false otherwise.
+ bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
+ return enableNamedBit(MI, AMDGPU::CPol::SC0);
+ }
+
+ /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
+ /// is modified, false otherwise.
+ bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
+ return enableNamedBit(MI, AMDGPU::CPol::SC1);
+ }
+
+ /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
+ /// is modified, false otherwise.
+ bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
+ return enableNamedBit(MI, AMDGPU::CPol::NT);
+ }
+
+public:
+
+ SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {};
+
+ bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override;
+
+ bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override;
+
+ bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const override;
+
+ bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
+ SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+ bool IsVolatile,
+ bool IsNonTemporal) const override;
+
+ bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace, Position Pos) const override;
+
+ bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
+ Position Pos) const override;
+};
+
class SIGfx10CacheControl : public SIGfx7CacheControl {
protected:
@@ -775,6 +825,8 @@ bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
GCNSubtarget::Generation Generation = ST.getGeneration();
+ if (ST.hasGFX940Insts())
+ return std::make_unique<SIGfx940CacheControl>(ST);
if (ST.hasGFX90AInsts())
return std::make_unique<SIGfx90ACacheControl>(ST);
if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
@@ -1388,6 +1440,308 @@ bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
return Changed;
}
+bool SIGfx940CacheControl::enableLoadCacheBypass(
+ const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const {
+ assert(MI->mayLoad() && !MI->mayStore());
+ bool Changed = false;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Set SC bits to indicate system scope.
+ Changed |= enableSC0Bit(MI);
+ Changed |= enableSC1Bit(MI);
+ break;
+ case SIAtomicScope::AGENT:
+ // Set SC bits to indicate agent scope.
+ Changed |= enableSC1Bit(MI);
+ break;
+ case SIAtomicScope::WORKGROUP:
+ // In threadgroup split mode the waves of a work-group can be executing on
+ //
diff erent CUs. Therefore need to bypass the L1 which is per CU.
+ // Otherwise in non-threadgroup split mode all waves of a work-group are
+ // on the same CU, and so the L1 does not need to be bypassed. Setting SC
+ // bits to indicate work-group scope will do this automatically.
+ Changed |= enableSC0Bit(MI);
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // Leave SC bits unset to indicate wavefront scope.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory caches
+ /// to be bypassed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+
+ return Changed;
+}
+
+bool SIGfx940CacheControl::enableStoreCacheBypass(
+ const MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
+ assert(!MI->mayLoad() && MI->mayStore());
+ bool Changed = false;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Set SC bits to indicate system scope.
+ Changed |= enableSC0Bit(MI);
+ Changed |= enableSC1Bit(MI);
+ break;
+ case SIAtomicScope::AGENT:
+ // Set SC bits to indicate agent scope.
+ Changed |= enableSC1Bit(MI);
+ break;
+ case SIAtomicScope::WORKGROUP:
+ // Set SC bits to indicate workgroup scope.
+ Changed |= enableSC0Bit(MI);
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // Leave SC bits unset to indicate wavefront scope.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory caches
+ /// to be bypassed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+
+ return Changed;
+}
+
+bool SIGfx940CacheControl::enableRMWCacheBypass(
+ const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace) const {
+ assert(MI->mayLoad() && MI->mayStore());
+ bool Changed = false;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Set SC1 bit to indicate system scope.
+ Changed |= enableSC1Bit(MI);
+ break;
+ case SIAtomicScope::AGENT:
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // RMW atomic operations implicitly bypass the L1 cache and only use SC1
+ // to indicate system or agent scope. The SC0 bit is used to indicate if
+ // they are return or no-return. Leave SC1 bit unset to indicate agent
+ // scope.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ return Changed;
+}
+
+bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
+ MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+ bool IsVolatile, bool IsNonTemporal) const {
+ // Only handle load and store, not atomic read-modify-write insructions. The
+ // latter use glc to indicate if the atomic returns a result and so must not
+ // be used for cache control.
+ assert(MI->mayLoad() ^ MI->mayStore());
+
+ // Only update load and store, not LLVM IR atomic read-modify-write
+ // instructions. The latter are always marked as volatile so cannot sensibly
+ // handle it as do not want to pessimize all atomics. Also they do not support
+ // the nontemporal attribute.
+ assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
+
+ bool Changed = false;
+
+ if (IsVolatile) {
+ // Set SC bits to indicate system scope.
+ Changed |= enableSC0Bit(MI);
+ Changed |= enableSC1Bit(MI);
+
+ // Ensure operation has completed at system scope to cause all volatile
+ // operations to be visible outside the program in a global order. Do not
+ // request cross address space as only the global address space can be
+ // observable outside the program, so no need to cause a waitcnt for LDS
+ // address space operations.
+ Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
+ Position::AFTER);
+
+ return Changed;
+ }
+
+ if (IsNonTemporal) {
+ Changed |= enableNTBit(MI);
+ return Changed;
+ }
+
+ return Changed;
+}
+
+bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ Position Pos) const {
+ if (!InsertCacheInv)
+ return false;
+
+ bool Changed = false;
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ if (Pos == Position::AFTER)
+ ++MI;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Ensures that following loads will not see stale remote VMEM data or
+ // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
+ // CC will never be stale due to the local memory probes.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+ // Set SC bits to indicate system scope.
+ .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
+ // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
+ // hardware does not reorder memory operations by the same wave with
+ // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
+ // remove any cache lines of earlier writes by the same wave and ensures
+ // later reads by the same wave will refetch the cache lines.
+ Changed = true;
+ break;
+ case SIAtomicScope::AGENT:
+ // Ensures that following loads will not see stale remote date or local
+ // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
+ // due to the memory probes.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+ // Set SC bits to indicate agent scope.
+ .addImm(AMDGPU::CPol::SC1);
+ // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
+ // does not reorder memory operations with respect to preceeding buffer
+ // invalidate. The invalidate is guaranteed to remove any cache lines of
+ // earlier writes and ensures later writes will refetch the cache lines.
+ Changed = true;
+ break;
+ case SIAtomicScope::WORKGROUP:
+ // In threadgroup split mode the waves of a work-group can be executing on
+ //
diff erent CUs. Therefore need to invalidate the L1 which is per CU.
+ // Otherwise in non-threadgroup split mode all waves of a work-group are
+ // on the same CU, and so the L1 does not need to be invalidated.
+ if (ST.isTgSplitEnabled()) {
+ // Ensures L1 is invalidated if in threadgroup split mode. In
+ // non-threadgroup split mode it is a NOP, but no point generating it in
+ // that case if know not in that mode.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
+ // Set SC bits to indicate work-group scope.
+ .addImm(AMDGPU::CPol::SC0);
+ // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
+ // does not reorder memory operations with respect to preceeding buffer
+ // invalidate. The invalidate is guaranteed to remove any cache lines of
+ // earlier writes and ensures later writes will refetch the cache lines.
+ Changed = true;
+ }
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // Could generate "BUFFER_INV" but it would do nothing as there are no
+ // caches to invalidate.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ /// The scratch address space does not need the global memory cache
+ /// to be flushed as all memory operations by the same thread are
+ /// sequentially consistent, and no other thread can access scratch
+ /// memory.
+
+ /// Other address spaces do not have a cache.
+
+ if (Pos == Position::AFTER)
+ --MI;
+
+ return Changed;
+}
+
+bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const {
+ bool Changed = false;
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ if (Pos == Position::AFTER)
+ ++MI;
+
+ if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
+ // hardware does not reorder memory operations by the same wave with
+ // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
+ // to initiate writeback of any dirty cache lines of earlier writes by the
+ // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
+ // writeback has completed.
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
+ // Set SC bits to indicate system scope.
+ .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
+ // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
+ // SIAtomicScope::SYSTEM, the following insertWait will generate the
+ // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
+ Changed = true;
+ break;
+ case SIAtomicScope::AGENT:
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
+ // Set SC bits to indicate agent scope.
+ .addImm(AMDGPU::CPol::SC1);
+
+ // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
+ // SIAtomicScope::AGENT, the following insertWait will generate the
+ // required "S_WAITCNT vmcnt(0)".
+ Changed = true;
+ break;
+ case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // Do not generate "BUFFER_WBL2" as there are no caches it would
+ // writeback, and would require an otherwise unnecessary
+ // "S_WAITCNT vmcnt(0)".
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ if (Pos == Position::AFTER)
+ --MI;
+
+ // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
+ // S_WAITCNT needed.
+ Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
+ IsCrossAddrSpaceOrdering, Pos);
+
+ return Changed;
+}
+
bool SIGfx10CacheControl::enableLoadCacheBypass(
const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
index 06dc89fcaa720..b9140890b5c4c 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
@@ -31,12 +31,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(float* %ptr) {
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NEXT: buffer_wbl2
+; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2
+; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_invl2
-; GFX940-NEXT: buffer_wbinvl1_vol
+; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_endpgm
%ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst
ret void
@@ -49,12 +48,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(float* %ptr) #0 {
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NEXT: buffer_wbl2
+; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2
+; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_invl2
-; GFX940-NEXT: buffer_wbinvl1_vol
+; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_endpgm
%ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst
ret void
@@ -76,12 +74,11 @@ define float @flat_atomic_fadd_f32_rtn_pat(float* %ptr, float %data) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
-; GFX940-NEXT: buffer_wbl2
+; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0
+; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_invl2
-; GFX940-NEXT: buffer_wbinvl1_vol
+; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
%ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst
ret float %ret
@@ -195,12 +192,11 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(<2 x i16> addrspace(3)
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-NEXT: buffer_wbl2
+; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: ds_pk_add_bf16 v0, v1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_invl2
-; GFX940-NEXT: buffer_wbinvl1_vol
+; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_endpgm
%ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data)
ret void
@@ -210,12 +206,11 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(<2 x i16> addrspace(3)* %ptr, <2
; GFX940-LABEL: local_atomic_fadd_v2bf16_rtn:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_wbl2
+; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_invl2
-; GFX940-NEXT: buffer_wbinvl1_vol
+; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data)
ret <2 x i16> %ret
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
index 8e20ea43fed9d..26201f2819601 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
@@ -6,6 +6,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
define amdgpu_kernel void @singlethread_acquire_fence() {
; GFX6-LABEL: singlethread_acquire_fence:
@@ -35,6 +37,14 @@ define amdgpu_kernel void @singlethread_acquire_fence() {
; GFX90A-TGSPLIT-LABEL: singlethread_acquire_fence:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: singlethread_acquire_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: singlethread_acquire_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("singlethread") acquire
ret void
@@ -68,6 +78,14 @@ define amdgpu_kernel void @singlethread_release_fence() {
; GFX90A-TGSPLIT-LABEL: singlethread_release_fence:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: singlethread_release_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: singlethread_release_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("singlethread") release
ret void
@@ -101,6 +119,14 @@ define amdgpu_kernel void @singlethread_acq_rel_fence() {
; GFX90A-TGSPLIT-LABEL: singlethread_acq_rel_fence:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: singlethread_acq_rel_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: singlethread_acq_rel_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("singlethread") acq_rel
ret void
@@ -134,6 +160,14 @@ define amdgpu_kernel void @singlethread_seq_cst_fence() {
; GFX90A-TGSPLIT-LABEL: singlethread_seq_cst_fence:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: singlethread_seq_cst_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: singlethread_seq_cst_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("singlethread") seq_cst
ret void
@@ -167,6 +201,14 @@ define amdgpu_kernel void @singlethread_one_as_acquire_fence() {
; GFX90A-TGSPLIT-LABEL: singlethread_one_as_acquire_fence:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: singlethread_one_as_acquire_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: singlethread_one_as_acquire_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("singlethread-one-as") acquire
ret void
@@ -200,6 +242,14 @@ define amdgpu_kernel void @singlethread_one_as_release_fence() {
; GFX90A-TGSPLIT-LABEL: singlethread_one_as_release_fence:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: singlethread_one_as_release_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: singlethread_one_as_release_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("singlethread-one-as") release
ret void
@@ -233,6 +283,14 @@ define amdgpu_kernel void @singlethread_one_as_acq_rel_fence() {
; GFX90A-TGSPLIT-LABEL: singlethread_one_as_acq_rel_fence:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: singlethread_one_as_acq_rel_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: singlethread_one_as_acq_rel_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("singlethread-one-as") acq_rel
ret void
@@ -266,6 +324,14 @@ define amdgpu_kernel void @singlethread_one_as_seq_cst_fence() {
; GFX90A-TGSPLIT-LABEL: singlethread_one_as_seq_cst_fence:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: singlethread_one_as_seq_cst_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: singlethread_one_as_seq_cst_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("singlethread-one-as") seq_cst
ret void
@@ -299,6 +365,14 @@ define amdgpu_kernel void @wavefront_acquire_fence() {
; GFX90A-TGSPLIT-LABEL: wavefront_acquire_fence:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: wavefront_acquire_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: wavefront_acquire_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("wavefront") acquire
ret void
@@ -332,6 +406,14 @@ define amdgpu_kernel void @wavefront_release_fence() {
; GFX90A-TGSPLIT-LABEL: wavefront_release_fence:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: wavefront_release_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: wavefront_release_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("wavefront") release
ret void
@@ -365,6 +447,14 @@ define amdgpu_kernel void @wavefront_acq_rel_fence() {
; GFX90A-TGSPLIT-LABEL: wavefront_acq_rel_fence:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: wavefront_acq_rel_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: wavefront_acq_rel_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("wavefront") acq_rel
ret void
@@ -398,6 +488,14 @@ define amdgpu_kernel void @wavefront_seq_cst_fence() {
; GFX90A-TGSPLIT-LABEL: wavefront_seq_cst_fence:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: wavefront_seq_cst_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: wavefront_seq_cst_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("wavefront") seq_cst
ret void
@@ -431,6 +529,14 @@ define amdgpu_kernel void @wavefront_one_as_acquire_fence() {
; GFX90A-TGSPLIT-LABEL: wavefront_one_as_acquire_fence:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: wavefront_one_as_acquire_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: wavefront_one_as_acquire_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("wavefront-one-as") acquire
ret void
@@ -464,6 +570,14 @@ define amdgpu_kernel void @wavefront_one_as_release_fence() {
; GFX90A-TGSPLIT-LABEL: wavefront_one_as_release_fence:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: wavefront_one_as_release_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: wavefront_one_as_release_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("wavefront-one-as") release
ret void
@@ -497,6 +611,14 @@ define amdgpu_kernel void @wavefront_one_as_acq_rel_fence() {
; GFX90A-TGSPLIT-LABEL: wavefront_one_as_acq_rel_fence:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: wavefront_one_as_acq_rel_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: wavefront_one_as_acq_rel_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("wavefront-one-as") acq_rel
ret void
@@ -530,6 +652,14 @@ define amdgpu_kernel void @wavefront_one_as_seq_cst_fence() {
; GFX90A-TGSPLIT-LABEL: wavefront_one_as_seq_cst_fence:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: wavefront_one_as_seq_cst_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: wavefront_one_as_seq_cst_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("wavefront-one-as") seq_cst
ret void
@@ -573,6 +703,17 @@ define amdgpu_kernel void @workgroup_acquire_fence() {
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: workgroup_acquire_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: workgroup_acquire_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acquire
ret void
@@ -614,6 +755,16 @@ define amdgpu_kernel void @workgroup_release_fence() {
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: workgroup_release_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: workgroup_release_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("workgroup") release
ret void
@@ -657,6 +808,17 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: workgroup_acq_rel_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: workgroup_acq_rel_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acq_rel
ret void
@@ -700,6 +862,17 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: workgroup_seq_cst_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: workgroup_seq_cst_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("workgroup") seq_cst
ret void
@@ -738,6 +911,16 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() {
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: workgroup_one_as_acquire_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: workgroup_one_as_acquire_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") acquire
ret void
@@ -774,6 +957,15 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: workgroup_one_as_release_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: workgroup_one_as_release_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") release
ret void
@@ -812,6 +1004,16 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") acq_rel
ret void
@@ -850,6 +1052,16 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") seq_cst
ret void
@@ -900,6 +1112,20 @@ define amdgpu_kernel void @agent_acquire_fence() {
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: agent_acquire_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: agent_acquire_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("agent") acquire
ret void
@@ -942,6 +1168,18 @@ define amdgpu_kernel void @agent_release_fence() {
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: agent_release_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: agent_release_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("agent") release
ret void
@@ -992,6 +1230,20 @@ define amdgpu_kernel void @agent_acq_rel_fence() {
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: agent_acq_rel_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: agent_acq_rel_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("agent") acq_rel
ret void
@@ -1042,6 +1294,20 @@ define amdgpu_kernel void @agent_seq_cst_fence() {
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: agent_seq_cst_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: agent_seq_cst_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("agent") seq_cst
ret void
@@ -1092,6 +1358,20 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() {
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: agent_one_as_acquire_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: agent_one_as_acquire_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") acquire
ret void
@@ -1134,6 +1414,18 @@ define amdgpu_kernel void @agent_one_as_release_fence() {
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: agent_one_as_release_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: agent_one_as_release_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") release
ret void
@@ -1184,6 +1476,20 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: agent_one_as_acq_rel_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: agent_one_as_acq_rel_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") acq_rel
ret void
@@ -1234,6 +1540,20 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: agent_one_as_seq_cst_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: agent_one_as_seq_cst_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") seq_cst
ret void
@@ -1288,6 +1608,20 @@ define amdgpu_kernel void @system_acquire_fence() {
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: system_acquire_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: system_acquire_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence acquire
ret void
@@ -1332,6 +1666,18 @@ define amdgpu_kernel void @system_release_fence() {
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: system_release_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: system_release_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence release
ret void
@@ -1386,6 +1732,20 @@ define amdgpu_kernel void @system_acq_rel_fence() {
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: system_acq_rel_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: system_acq_rel_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence acq_rel
ret void
@@ -1440,6 +1800,20 @@ define amdgpu_kernel void @system_seq_cst_fence() {
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: system_seq_cst_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: system_seq_cst_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence seq_cst
ret void
@@ -1494,6 +1868,20 @@ define amdgpu_kernel void @system_one_as_acquire_fence() {
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: system_one_as_acquire_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: system_one_as_acquire_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("one-as") acquire
ret void
@@ -1538,6 +1926,18 @@ define amdgpu_kernel void @system_one_as_release_fence() {
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: system_one_as_release_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: system_one_as_release_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("one-as") release
ret void
@@ -1592,6 +1992,20 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() {
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: system_one_as_acq_rel_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: system_one_as_acq_rel_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("one-as") acq_rel
ret void
@@ -1646,6 +2060,20 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() {
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: system_one_as_seq_cst_fence:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: system_one_as_seq_cst_fence:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
entry:
fence syncscope("one-as") seq_cst
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
index 4ef2355f2bd03..56f1ec99daba9 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
@@ -5,6 +5,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
define amdgpu_kernel void @flat_agent_unordered_load(
; GFX7-LABEL: flat_agent_unordered_load:
@@ -84,6 +86,32 @@ define amdgpu_kernel void @flat_agent_unordered_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("agent") unordered, align 4
@@ -169,6 +197,32 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("agent") monotonic, align 4
@@ -261,6 +315,34 @@ define amdgpu_kernel void @flat_agent_acquire_load(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("agent") acquire, align 4
@@ -361,6 +443,36 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("agent") seq_cst, align 4
@@ -434,6 +546,26 @@ define amdgpu_kernel void @flat_agent_unordered_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("agent") unordered, align 4
@@ -506,6 +638,26 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("agent") monotonic, align 4
@@ -586,6 +738,30 @@ define amdgpu_kernel void @flat_agent_release_store(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("agent") release, align 4
@@ -666,6 +842,30 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("agent") seq_cst, align 4
@@ -738,6 +938,26 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") monotonic
@@ -825,6 +1045,30 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_acquire_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acquire
@@ -905,6 +1149,30 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_release_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") release
@@ -1000,6 +1268,34 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acq_rel
@@ -1095,6 +1391,34 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") seq_cst
@@ -1186,6 +1510,32 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acquire
@@ -1286,6 +1636,36 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acq_rel
@@ -1386,6 +1766,36 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") seq_cst
@@ -1471,6 +1881,26 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1571,6 +2001,30 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1664,6 +2118,30 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1772,6 +2250,34 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1880,6 +2386,34 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1980,6 +2514,30 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2080,6 +2638,30 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2188,6 +2770,34 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2296,6 +2906,34 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2404,6 +3042,34 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2512,6 +3178,34 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2620,6 +3314,34 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2728,6 +3450,34 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2836,6 +3586,34 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2944,6 +3722,34 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3049,6 +3855,30 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3163,6 +3993,32 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3278,6 +4134,34 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3400,6 +4284,36 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3522,6 +4436,36 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3636,6 +4580,32 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3750,6 +4720,32 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3872,6 +4868,36 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3994,6 +5020,36 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4116,6 +5172,36 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4238,6 +5324,36 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4360,6 +5476,36 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4482,6 +5628,36 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4604,6 +5780,36 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4726,6 +5932,36 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4813,6 +6049,32 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("agent-one-as") unordered, align 4
@@ -4898,6 +6160,32 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("agent-one-as") monotonic, align 4
@@ -4995,6 +6283,35 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("agent-one-as") acquire, align 4
@@ -5100,6 +6417,37 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("agent-one-as") seq_cst, align 4
@@ -5173,6 +6521,26 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("agent-one-as") unordered, align 4
@@ -5245,6 +6613,26 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("agent-one-as") monotonic, align 4
@@ -5325,6 +6713,30 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("agent-one-as") release, align 4
@@ -5405,6 +6817,30 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("agent-one-as") seq_cst, align 4
@@ -5477,6 +6913,26 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") monotonic
@@ -5562,6 +7018,30 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acquire
@@ -5642,6 +7122,30 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") release
@@ -5735,6 +7239,34 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -5828,6 +7360,34 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -5923,6 +7483,33 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acquire
@@ -6027,6 +7614,37 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -6131,6 +7749,37 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -6216,6 +7865,26 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6314,6 +7983,30 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6407,6 +8100,30 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6513,6 +8230,34 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6619,6 +8364,34 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6717,6 +8490,30 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6815,6 +8612,30 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6921,6 +8742,34 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7027,6 +8876,34 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7133,6 +9010,34 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7239,6 +9144,34 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7345,6 +9278,34 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7451,6 +9412,34 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7557,6 +9546,34 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7663,6 +9680,34 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7768,6 +9813,30 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7887,6 +9956,33 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8002,6 +10098,34 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8129,6 +10253,37 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8256,6 +10411,37 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8375,6 +10561,33 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8494,6 +10707,33 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8621,6 +10861,37 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8748,6 +11019,37 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8875,6 +11177,37 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9002,6 +11335,37 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9129,6 +11493,37 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9256,6 +11651,37 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9383,6 +11809,37 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9510,6 +11967,37 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
index ab79a4cb73af5..89129f7764844 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
@@ -5,6 +5,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX7-LABEL: flat_nontemporal_load_0:
@@ -84,6 +86,32 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_load_0:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] nt
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_nontemporal_load_0:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] nt
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load i32, i32* %in, align 4, !nontemporal !0
@@ -179,6 +207,36 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_load_1:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] nt
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_nontemporal_load_1:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX940-TGSPLIT-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] nt
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -266,6 +324,32 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 glc slc
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_store_0:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 nt
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_nontemporal_store_0:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 nt
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load i32, i32* %in, align 4
@@ -361,6 +445,36 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 glc slc
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_store_1:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
+; GFX940-NOTTGSPLIT-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 nt
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_nontemporal_store_1:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
+; GFX940-TGSPLIT-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 nt
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
index 49f2733fa2acd..b2b1484350683 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
@@ -5,6 +5,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
define amdgpu_kernel void @flat_singlethread_unordered_load(
; GFX7-LABEL: flat_singlethread_unordered_load:
@@ -84,6 +86,32 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("singlethread") unordered, align 4
@@ -169,6 +197,32 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("singlethread") monotonic, align 4
@@ -254,6 +308,32 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("singlethread") acquire, align 4
@@ -339,6 +419,32 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("singlethread") seq_cst, align 4
@@ -412,6 +518,26 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("singlethread") unordered, align 4
@@ -484,6 +610,26 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("singlethread") monotonic, align 4
@@ -556,6 +702,26 @@ define amdgpu_kernel void @flat_singlethread_release_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("singlethread") release, align 4
@@ -628,6 +794,26 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("singlethread") seq_cst, align 4
@@ -700,6 +886,26 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") monotonic
@@ -772,6 +978,26 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acquire
@@ -844,6 +1070,26 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") release
@@ -916,6 +1162,26 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acq_rel
@@ -988,6 +1254,26 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") seq_cst
@@ -1072,6 +1358,30 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acquire
@@ -1157,6 +1467,30 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acq_rel
@@ -1242,6 +1576,30 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") seq_cst
@@ -1327,6 +1685,26 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1412,6 +1790,26 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1497,6 +1895,26 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1582,6 +2000,26 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1667,6 +2105,26 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1752,6 +2210,26 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1837,6 +2315,26 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1922,6 +2420,26 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2007,6 +2525,26 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2092,6 +2630,26 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2177,6 +2735,26 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2262,6 +2840,26 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2347,6 +2945,26 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2432,6 +3050,26 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2517,6 +3155,26 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2622,6 +3280,30 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2729,6 +3411,30 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2836,6 +3542,30 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2943,6 +3673,30 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3050,6 +3804,30 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3157,6 +3935,30 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3264,6 +4066,30 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3371,6 +4197,30 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3478,6 +4328,30 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3585,6 +4459,30 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3692,6 +4590,30 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3799,6 +4721,30 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3906,6 +4852,30 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4013,6 +4983,30 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4120,6 +5114,30 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4207,6 +5225,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("singlethread-one-as") unordered, align 4
@@ -4292,6 +5336,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("singlethread-one-as") monotonic, align 4
@@ -4377,6 +5447,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("singlethread-one-as") acquire, align 4
@@ -4462,6 +5558,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("singlethread-one-as") seq_cst, align 4
@@ -4535,6 +5657,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("singlethread-one-as") unordered, align 4
@@ -4607,6 +5749,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("singlethread-one-as") monotonic, align 4
@@ -4679,6 +5841,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("singlethread-one-as") release, align 4
@@ -4751,6 +5933,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("singlethread-one-as") seq_cst, align 4
@@ -4823,6 +6025,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") monotonic
@@ -4895,6 +6117,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -4967,6 +6209,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") release
@@ -5039,6 +6301,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -5111,6 +6393,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -5195,6 +6497,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -5280,6 +6606,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -5365,6 +6715,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -5450,6 +6824,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5535,6 +6929,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5620,6 +7034,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5705,6 +7139,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5790,6 +7244,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5875,6 +7349,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5960,6 +7454,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6045,6 +7559,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6130,6 +7664,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6215,6 +7769,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6300,6 +7874,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6385,6 +7979,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6470,6 +8084,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6555,6 +8189,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6640,6 +8294,26 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6745,6 +8419,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6852,6 +8550,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6959,6 +8681,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7066,6 +8812,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7173,6 +8943,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7280,6 +9074,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7387,6 +9205,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7494,6 +9336,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7601,6 +9467,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7708,6 +9598,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7815,6 +9729,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7922,6 +9860,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8029,6 +9991,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8136,6 +10122,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8243,6 +10253,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
index b9a307074492b..f9318d31b68a4 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
@@ -5,6 +5,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
define amdgpu_kernel void @flat_system_unordered_load(
; GFX7-LABEL: flat_system_unordered_load:
@@ -84,6 +86,32 @@ define amdgpu_kernel void @flat_system_unordered_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in unordered, align 4
@@ -169,6 +197,32 @@ define amdgpu_kernel void @flat_system_monotonic_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in monotonic, align 4
@@ -263,6 +317,34 @@ define amdgpu_kernel void @flat_system_acquire_load(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in acquire, align 4
@@ -365,6 +447,36 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in seq_cst, align 4
@@ -438,6 +550,26 @@ define amdgpu_kernel void @flat_system_unordered_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out unordered, align 4
@@ -510,6 +642,26 @@ define amdgpu_kernel void @flat_system_monotonic_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out monotonic, align 4
@@ -592,6 +744,30 @@ define amdgpu_kernel void @flat_system_release_store(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out release, align 4
@@ -674,6 +850,30 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out seq_cst, align 4
@@ -746,6 +946,26 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_monotonic_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in monotonic
@@ -835,6 +1055,30 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_acquire_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in acquire
@@ -917,6 +1161,30 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_release_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_release_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in release
@@ -1016,6 +1284,34 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in acq_rel
@@ -1115,6 +1411,34 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in seq_cst
@@ -1208,6 +1532,32 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in acquire
@@ -1312,6 +1662,36 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in acq_rel
@@ -1416,6 +1796,36 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in seq_cst
@@ -1501,6 +1911,26 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1603,6 +2033,30 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1698,6 +2152,30 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1810,6 +2288,34 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1922,6 +2428,34 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2024,6 +2558,30 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2126,6 +2684,30 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2238,6 +2820,34 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_release_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_release_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2350,6 +2960,34 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2462,6 +3100,34 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2574,6 +3240,34 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2686,6 +3380,34 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2798,6 +3520,34 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2910,6 +3660,34 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3022,6 +3800,34 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3127,6 +3933,30 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3243,6 +4073,32 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3360,6 +4216,34 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3486,6 +4370,36 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3612,6 +4526,36 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3728,6 +4672,32 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3844,6 +4814,32 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3970,6 +4966,36 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4096,6 +5122,36 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4222,6 +5278,36 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4348,6 +5434,36 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4474,6 +5590,36 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4600,6 +5746,36 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4726,6 +5902,36 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4852,6 +6058,36 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4939,6 +6175,32 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("one-as") unordered, align 4
@@ -5024,6 +6286,32 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("one-as") monotonic, align 4
@@ -5123,6 +6411,35 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("one-as") acquire, align 4
@@ -5230,6 +6547,37 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("one-as") seq_cst, align 4
@@ -5303,6 +6651,26 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("one-as") unordered, align 4
@@ -5375,6 +6743,26 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("one-as") monotonic, align 4
@@ -5457,6 +6845,30 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("one-as") release, align 4
@@ -5539,6 +6951,30 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("one-as") seq_cst, align 4
@@ -5611,6 +7047,26 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") monotonic
@@ -5698,6 +7154,30 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acquire
@@ -5780,6 +7260,30 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") release
@@ -5877,6 +7381,34 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acq_rel
@@ -5974,6 +7506,34 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") seq_cst
@@ -6071,6 +7631,33 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acquire
@@ -6179,6 +7766,37 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acq_rel
@@ -6287,6 +7905,37 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") seq_cst
@@ -6372,6 +8021,26 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6472,6 +8141,30 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6567,6 +8260,30 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6677,6 +8394,34 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6787,6 +8532,34 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6887,6 +8660,30 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6987,6 +8784,30 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7097,6 +8918,34 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7207,6 +9056,34 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7317,6 +9194,34 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7427,6 +9332,34 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7537,6 +9470,34 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7647,6 +9608,34 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7757,6 +9746,34 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7867,6 +9884,34 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7972,6 +10017,30 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8093,6 +10162,33 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8210,6 +10306,34 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8341,6 +10465,37 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8472,6 +10627,37 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8593,6 +10779,33 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8714,6 +10927,33 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8845,6 +11085,37 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8976,6 +11247,37 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9107,6 +11409,37 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9238,6 +11571,37 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9369,6 +11733,37 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9500,6 +11895,37 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9631,6 +12057,37 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9762,6 +12219,37 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
index 1569af9d257f2..062f981230ba0 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
@@ -5,6 +5,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
define amdgpu_kernel void @flat_wavefront_unordered_load(
; GFX7-LABEL: flat_wavefront_unordered_load:
@@ -84,6 +86,32 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("wavefront") unordered, align 4
@@ -169,6 +197,32 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("wavefront") monotonic, align 4
@@ -254,6 +308,32 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("wavefront") acquire, align 4
@@ -339,6 +419,32 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("wavefront") seq_cst, align 4
@@ -412,6 +518,26 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("wavefront") unordered, align 4
@@ -484,6 +610,26 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("wavefront") monotonic, align 4
@@ -556,6 +702,26 @@ define amdgpu_kernel void @flat_wavefront_release_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("wavefront") release, align 4
@@ -628,6 +794,26 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("wavefront") seq_cst, align 4
@@ -700,6 +886,26 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") monotonic
@@ -772,6 +978,26 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acquire
@@ -844,6 +1070,26 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") release
@@ -916,6 +1162,26 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acq_rel
@@ -988,6 +1254,26 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") seq_cst
@@ -1072,6 +1358,30 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acquire
@@ -1157,6 +1467,30 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acq_rel
@@ -1242,6 +1576,30 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") seq_cst
@@ -1327,6 +1685,26 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1412,6 +1790,26 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1497,6 +1895,26 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1582,6 +2000,26 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1667,6 +2105,26 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1752,6 +2210,26 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1837,6 +2315,26 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1922,6 +2420,26 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2007,6 +2525,26 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2092,6 +2630,26 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2177,6 +2735,26 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2262,6 +2840,26 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2347,6 +2945,26 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2432,6 +3050,26 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2517,6 +3155,26 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2622,6 +3280,30 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2729,6 +3411,30 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2836,6 +3542,30 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2943,6 +3673,30 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3050,6 +3804,30 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3157,6 +3935,30 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3264,6 +4066,30 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3371,6 +4197,30 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3478,6 +4328,30 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3585,6 +4459,30 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3692,6 +4590,30 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3799,6 +4721,30 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3906,6 +4852,30 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4013,6 +4983,30 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4120,6 +5114,30 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4207,6 +5225,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("wavefront-one-as") unordered, align 4
@@ -4292,6 +5336,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("wavefront-one-as") monotonic, align 4
@@ -4377,6 +5447,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("wavefront-one-as") acquire, align 4
@@ -4462,6 +5558,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("wavefront-one-as") seq_cst, align 4
@@ -4535,6 +5657,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("wavefront-one-as") unordered, align 4
@@ -4607,6 +5749,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("wavefront-one-as") monotonic, align 4
@@ -4679,6 +5841,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("wavefront-one-as") release, align 4
@@ -4751,6 +5933,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("wavefront-one-as") seq_cst, align 4
@@ -4823,6 +6025,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") monotonic
@@ -4895,6 +6117,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -4967,6 +6209,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") release
@@ -5039,6 +6301,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -5111,6 +6393,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -5195,6 +6497,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -5280,6 +6606,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -5365,6 +6715,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -5450,6 +6824,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5535,6 +6929,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5620,6 +7034,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5705,6 +7139,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5790,6 +7244,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5875,6 +7349,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5960,6 +7454,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6045,6 +7559,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6130,6 +7664,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6215,6 +7769,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6300,6 +7874,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6385,6 +7979,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6470,6 +8084,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6555,6 +8189,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6640,6 +8294,26 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6745,6 +8419,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6852,6 +8550,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6959,6 +8681,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7066,6 +8812,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7173,6 +8943,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7280,6 +9074,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7387,6 +9205,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7494,6 +9336,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7601,6 +9467,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7708,6 +9598,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7815,6 +9729,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7922,6 +9860,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8029,6 +9991,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8136,6 +10122,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
index ead08ed9fb2bd..cc06b9be0d5ac 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
@@ -5,6 +5,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
define amdgpu_kernel void @flat_workgroup_unordered_load(
; GFX7-LABEL: flat_workgroup_unordered_load:
@@ -84,6 +86,32 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("workgroup") unordered, align 4
@@ -169,6 +197,32 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("workgroup") monotonic, align 4
@@ -260,6 +314,34 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("workgroup") acquire, align 4
@@ -358,6 +440,36 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("workgroup") seq_cst, align 4
@@ -431,6 +543,26 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("workgroup") unordered, align 4
@@ -503,6 +635,26 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("workgroup") monotonic, align 4
@@ -582,6 +734,28 @@ define amdgpu_kernel void @flat_workgroup_release_store(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("workgroup") release, align 4
@@ -661,6 +835,28 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("workgroup") seq_cst, align 4
@@ -733,6 +929,26 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") monotonic
@@ -814,6 +1030,29 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acquire
@@ -893,6 +1132,28 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") release
@@ -981,6 +1242,31 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acq_rel
@@ -1069,6 +1355,31 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") seq_cst
@@ -1155,6 +1466,31 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acquire
@@ -1249,6 +1585,33 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acq_rel
@@ -1343,6 +1706,33 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") seq_cst
@@ -1428,6 +1818,26 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1522,6 +1932,29 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1614,6 +2047,28 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1715,6 +2170,31 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1816,6 +2296,31 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1910,6 +2415,29 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2004,6 +2532,29 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2105,6 +2656,31 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2206,6 +2782,31 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2307,6 +2908,31 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2408,6 +3034,31 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2513,6 +3164,30 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2625,6 +3300,31 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2739,6 +3439,32 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2858,6 +3584,33 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2977,6 +3730,33 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3089,6 +3869,31 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3201,6 +4006,31 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3320,6 +4150,33 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3439,6 +4296,33 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3558,6 +4442,33 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3677,6 +4588,33 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3796,6 +4734,33 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3915,6 +4880,33 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4034,6 +5026,33 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4153,6 +5172,33 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4240,6 +5286,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("workgroup-one-as") unordered, align 4
@@ -4325,6 +5397,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("workgroup-one-as") monotonic, align 4
@@ -4413,6 +5511,33 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("workgroup-one-as") acquire, align 4
@@ -4504,6 +5629,34 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("workgroup-one-as") seq_cst, align 4
@@ -4577,6 +5730,26 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("workgroup-one-as") unordered, align 4
@@ -4649,6 +5822,26 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("workgroup-one-as") monotonic, align 4
@@ -4724,6 +5917,27 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("workgroup-one-as") release, align 4
@@ -4799,6 +6013,27 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("workgroup-one-as") seq_cst, align 4
@@ -4871,6 +6106,26 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") monotonic
@@ -4947,6 +6202,28 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -5022,6 +6299,27 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") release
@@ -5101,6 +6399,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -5180,6 +6501,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -5267,6 +6611,31 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -5358,6 +6727,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -5449,6 +6844,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -5534,6 +6955,26 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5623,6 +7064,28 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5711,6 +7174,27 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5803,6 +7287,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5895,6 +7402,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5984,6 +7514,28 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6073,6 +7625,28 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6165,6 +7739,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6257,6 +7854,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6349,6 +7969,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6441,6 +8084,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6533,6 +8199,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6625,6 +8314,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6717,6 +8429,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6809,6 +8544,29 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6914,6 +8672,30 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7024,6 +8806,31 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7134,6 +8941,31 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7247,6 +9079,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7360,6 +9218,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7470,6 +9354,31 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7580,6 +9489,31 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7693,6 +9627,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7806,6 +9766,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7919,6 +9905,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8032,6 +10044,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8145,6 +10183,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8258,6 +10322,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8371,6 +10461,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8484,6 +10600,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
index 43120f5132077..8537aa5b4a309 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
@@ -6,6 +6,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
define amdgpu_kernel void @global_agent_unordered_load(
; GFX6-LABEL: global_agent_unordered_load:
@@ -90,6 +92,26 @@ define amdgpu_kernel void @global_agent_unordered_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("agent") unordered, align 4
@@ -180,6 +202,26 @@ define amdgpu_kernel void @global_agent_monotonic_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("agent") monotonic, align 4
@@ -278,6 +320,28 @@ define amdgpu_kernel void @global_agent_acquire_load(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("agent") acquire, align 4
@@ -381,6 +445,28 @@ define amdgpu_kernel void @global_agent_seq_cst_load(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("agent") seq_cst, align 4
@@ -463,6 +549,26 @@ define amdgpu_kernel void @global_agent_unordered_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent") unordered, align 4
@@ -544,6 +650,26 @@ define amdgpu_kernel void @global_agent_monotonic_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent") monotonic, align 4
@@ -634,6 +760,30 @@ define amdgpu_kernel void @global_agent_release_store(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent") release, align 4
@@ -724,6 +874,30 @@ define amdgpu_kernel void @global_agent_seq_cst_store(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent") seq_cst, align 4
@@ -805,6 +979,26 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_monotonic_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") monotonic
@@ -901,6 +1095,30 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_acquire_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") acquire
@@ -991,6 +1209,30 @@ define amdgpu_kernel void @global_agent_release_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_release_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_release_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") release
@@ -1096,6 +1338,34 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") acq_rel
@@ -1201,6 +1471,34 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") seq_cst
@@ -1304,6 +1602,32 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_acquire_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") acquire
@@ -1417,6 +1741,36 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") acq_rel
@@ -1530,6 +1884,36 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") seq_cst
@@ -1619,6 +2003,26 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_monotonic_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1723,6 +2127,30 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_acquire_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1821,6 +2249,30 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_release_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_release_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1934,6 +2386,34 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2047,6 +2527,34 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2151,6 +2659,30 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_monotonic_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2255,6 +2787,30 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_acquire_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2368,6 +2924,34 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_release_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_release_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2481,6 +3065,34 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2594,6 +3206,34 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2707,6 +3347,34 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2820,6 +3488,34 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_acquire_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2933,6 +3629,34 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_release_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_release_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3046,6 +3770,34 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3159,6 +3911,34 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3264,6 +4044,30 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3379,6 +4183,32 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3495,6 +4325,34 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_release_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_release_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3619,6 +4477,36 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3743,6 +4631,36 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3858,6 +4776,32 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3973,6 +4917,32 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4097,6 +5067,36 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_release_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_release_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4221,6 +5221,36 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4345,6 +5375,36 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4469,6 +5529,36 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4593,6 +5683,36 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4717,6 +5837,36 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4841,6 +5991,36 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4965,6 +6145,36 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5057,6 +6267,26 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") unordered, align 4
@@ -5147,6 +6377,26 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") monotonic, align 4
@@ -5245,6 +6495,28 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") acquire, align 4
@@ -5348,6 +6620,28 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") seq_cst, align 4
@@ -5430,6 +6724,26 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") unordered, align 4
@@ -5511,6 +6825,26 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") monotonic, align 4
@@ -5601,6 +6935,30 @@ define amdgpu_kernel void @global_agent_one_as_release_store(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") release, align 4
@@ -5691,6 +7049,30 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") seq_cst, align 4
@@ -5772,6 +7154,26 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") monotonic
@@ -5868,6 +7270,30 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acquire
@@ -5958,6 +7384,30 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") release
@@ -6063,6 +7513,34 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -6168,6 +7646,34 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -6271,6 +7777,32 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acquire
@@ -6384,6 +7916,36 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -6497,6 +8059,36 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -6586,6 +8178,26 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6690,6 +8302,30 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6788,6 +8424,30 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6901,6 +8561,34 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7014,6 +8702,34 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7118,6 +8834,30 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7222,6 +8962,30 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7335,6 +9099,34 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7448,6 +9240,34 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7561,6 +9381,34 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7674,6 +9522,34 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7787,6 +9663,34 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7900,6 +9804,34 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8013,6 +9945,34 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8126,6 +10086,34 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8231,6 +10219,30 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8346,6 +10358,32 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8470,6 +10508,36 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8594,6 +10662,36 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8709,6 +10807,32 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8824,6 +10948,32 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8948,6 +11098,36 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9072,6 +11252,36 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9196,6 +11406,36 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9320,6 +11560,36 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9444,6 +11714,36 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9568,6 +11868,36 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9692,6 +12022,36 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9816,6 +12176,36 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
index 02616719c90c6..38062a8ed0f82 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
@@ -6,6 +6,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
define amdgpu_kernel void @global_nontemporal_load_0(
; GFX6-LABEL: global_nontemporal_load_0:
@@ -91,6 +93,28 @@ define amdgpu_kernel void @global_nontemporal_load_0(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_nontemporal_load_0:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_nontemporal_load_0:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4, !nontemporal !0
@@ -191,6 +215,28 @@ define amdgpu_kernel void @global_nontemporal_load_1(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_nontemporal_load_1:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v0, v0, s[0:1] nt
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_nontemporal_load_1:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v0, v0, s[0:1] nt
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -284,6 +330,28 @@ define amdgpu_kernel void @global_nontemporal_store_0(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] glc slc
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_nontemporal_store_0:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] nt
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_nontemporal_store_0:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] nt
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4
@@ -379,6 +447,28 @@ define amdgpu_kernel void @global_nontemporal_store_1(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] glc slc
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_nontemporal_store_1:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] nt
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_nontemporal_store_1:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] nt
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
index 949b5a5ce4761..d09b06748fef4 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
@@ -6,6 +6,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
define amdgpu_kernel void @global_singlethread_unordered_load(
; GFX6-LABEL: global_singlethread_unordered_load:
@@ -90,6 +92,26 @@ define amdgpu_kernel void @global_singlethread_unordered_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread") unordered, align 4
@@ -180,6 +202,26 @@ define amdgpu_kernel void @global_singlethread_monotonic_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread") monotonic, align 4
@@ -270,6 +312,26 @@ define amdgpu_kernel void @global_singlethread_acquire_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread") acquire, align 4
@@ -360,6 +422,26 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread") seq_cst, align 4
@@ -442,6 +524,26 @@ define amdgpu_kernel void @global_singlethread_unordered_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread") unordered, align 4
@@ -523,6 +625,26 @@ define amdgpu_kernel void @global_singlethread_monotonic_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread") monotonic, align 4
@@ -604,6 +726,26 @@ define amdgpu_kernel void @global_singlethread_release_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread") release, align 4
@@ -685,6 +827,26 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread") seq_cst, align 4
@@ -766,6 +928,26 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") monotonic
@@ -847,6 +1029,26 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") acquire
@@ -928,6 +1130,26 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_release_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") release
@@ -1009,6 +1231,26 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") acq_rel
@@ -1090,6 +1332,26 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") seq_cst
@@ -1185,6 +1447,30 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") acquire
@@ -1281,6 +1567,30 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") acq_rel
@@ -1377,6 +1687,30 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") seq_cst
@@ -1466,6 +1800,26 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1555,6 +1909,26 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1644,6 +2018,26 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_release_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1733,6 +2127,26 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1822,6 +2236,26 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1911,6 +2345,26 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2000,6 +2454,26 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2089,6 +2563,26 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2178,6 +2672,26 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2267,6 +2781,26 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2356,6 +2890,26 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2445,6 +2999,26 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2534,6 +3108,26 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_release_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2623,6 +3217,26 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2712,6 +3326,26 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2817,6 +3451,30 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2924,6 +3582,30 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3031,6 +3713,30 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3138,6 +3844,30 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3245,6 +3975,30 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3352,6 +4106,30 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3459,6 +4237,30 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3566,6 +4368,30 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3673,6 +4499,30 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3780,6 +4630,30 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3887,6 +4761,30 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3994,6 +4892,30 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4101,6 +5023,30 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4208,6 +5154,30 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4315,6 +5285,30 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4407,6 +5401,26 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") unordered, align 4
@@ -4497,6 +5511,26 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") monotonic, align 4
@@ -4587,6 +5621,26 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") acquire, align 4
@@ -4677,6 +5731,26 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") seq_cst, align 4
@@ -4759,6 +5833,26 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") unordered, align 4
@@ -4840,6 +5934,26 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") monotonic, align 4
@@ -4921,6 +6035,26 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") release, align 4
@@ -5002,6 +6136,26 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") seq_cst, align 4
@@ -5083,6 +6237,26 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") monotonic
@@ -5164,6 +6338,26 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -5245,6 +6439,26 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") release
@@ -5326,6 +6540,26 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -5407,6 +6641,26 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -5502,6 +6756,30 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -5598,6 +6876,30 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -5694,6 +6996,30 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -5783,6 +7109,26 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5872,6 +7218,26 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5961,6 +7327,26 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6050,6 +7436,26 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6139,6 +7545,26 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6228,6 +7654,26 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6317,6 +7763,26 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6406,6 +7872,26 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6495,6 +7981,26 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6584,6 +8090,26 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6673,6 +8199,26 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6762,6 +8308,26 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6851,6 +8417,26 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6940,6 +8526,26 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7029,6 +8635,26 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7134,6 +8760,30 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7241,6 +8891,30 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7348,6 +9022,30 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7455,6 +9153,30 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7562,6 +9284,30 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7669,6 +9415,30 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7776,6 +9546,30 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7883,6 +9677,30 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7990,6 +9808,30 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8097,6 +9939,30 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8204,6 +10070,30 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8311,6 +10201,30 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8418,6 +10332,30 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8525,6 +10463,30 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8632,6 +10594,30 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
index 93a58c17153ae..54407c1f53114 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
@@ -6,6 +6,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
define amdgpu_kernel void @global_system_unordered_load(
; GFX6-LABEL: global_system_unordered_load:
@@ -90,6 +92,26 @@ define amdgpu_kernel void @global_system_unordered_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in unordered, align 4
@@ -180,6 +202,26 @@ define amdgpu_kernel void @global_system_monotonic_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in monotonic, align 4
@@ -280,6 +322,28 @@ define amdgpu_kernel void @global_system_acquire_load(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in acquire, align 4
@@ -385,6 +449,28 @@ define amdgpu_kernel void @global_system_seq_cst_load(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in seq_cst, align 4
@@ -467,6 +553,26 @@ define amdgpu_kernel void @global_system_unordered_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out unordered, align 4
@@ -548,6 +654,26 @@ define amdgpu_kernel void @global_system_monotonic_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out monotonic, align 4
@@ -640,6 +766,30 @@ define amdgpu_kernel void @global_system_release_store(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out release, align 4
@@ -732,6 +882,30 @@ define amdgpu_kernel void @global_system_seq_cst_store(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out seq_cst, align 4
@@ -813,6 +987,26 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_monotonic_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in monotonic
@@ -911,6 +1105,30 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_acquire_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acquire
@@ -1003,6 +1221,30 @@ define amdgpu_kernel void @global_system_release_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_release_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_release_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in release
@@ -1112,6 +1354,34 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_acq_rel_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acq_rel
@@ -1221,6 +1491,34 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_seq_cst_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
@@ -1326,6 +1624,32 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_acquire_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acquire
@@ -1443,6 +1767,36 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_acq_rel_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acq_rel
@@ -1560,6 +1914,36 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_seq_cst_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
@@ -1649,6 +2033,26 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1755,6 +2159,30 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1855,6 +2283,30 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_release_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_release_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1972,6 +2424,34 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2089,6 +2569,34 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2195,6 +2703,30 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_monotonic_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2301,6 +2833,30 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2418,6 +2974,34 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_release_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_release_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2535,6 +3119,34 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2652,6 +3264,34 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2769,6 +3409,34 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2874,6 +3542,30 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2991,6 +3683,32 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3119,6 +3837,36 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3247,6 +3995,36 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3364,6 +4142,32 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3481,6 +4285,32 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3609,6 +4439,36 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3737,6 +4597,36 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3865,6 +4755,36 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3993,6 +4913,36 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4121,6 +5071,36 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4249,6 +5229,36 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4377,6 +5387,36 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4505,6 +5545,36 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4597,6 +5667,26 @@ define amdgpu_kernel void @global_system_one_as_unordered_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") unordered, align 4
@@ -4687,6 +5777,26 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") monotonic, align 4
@@ -4787,6 +5897,28 @@ define amdgpu_kernel void @global_system_one_as_acquire_load(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") acquire, align 4
@@ -4892,6 +6024,28 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") seq_cst, align 4
@@ -4974,6 +6128,26 @@ define amdgpu_kernel void @global_system_one_as_unordered_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") unordered, align 4
@@ -5055,6 +6229,26 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") monotonic, align 4
@@ -5147,6 +6341,30 @@ define amdgpu_kernel void @global_system_one_as_release_store(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") release, align 4
@@ -5239,6 +6457,30 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") seq_cst, align 4
@@ -5320,6 +6562,26 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") monotonic
@@ -5418,6 +6680,30 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acquire
@@ -5510,6 +6796,30 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_release_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") release
@@ -5619,6 +6929,34 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acq_rel
@@ -5728,6 +7066,34 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") seq_cst
@@ -5833,6 +7199,32 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acquire
@@ -5950,6 +7342,36 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acq_rel
@@ -6067,6 +7489,36 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") seq_cst
@@ -6156,6 +7608,26 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6262,6 +7734,30 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6362,6 +7858,30 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6479,6 +7999,34 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6596,6 +8144,34 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6702,6 +8278,30 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6808,6 +8408,30 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6925,6 +8549,34 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7042,6 +8694,34 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7159,6 +8839,34 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7276,6 +8984,34 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7393,6 +9129,34 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7510,6 +9274,34 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7627,6 +9419,34 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7744,6 +9564,34 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7849,6 +9697,30 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7966,6 +9838,32 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8084,6 +9982,34 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8212,6 +10138,36 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8340,6 +10296,36 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8457,6 +10443,32 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8574,6 +10586,32 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8702,6 +10740,36 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8830,6 +10898,36 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8958,6 +11056,36 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9086,6 +11214,36 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9214,6 +11372,36 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9342,6 +11530,36 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9470,6 +11688,36 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9598,6 +11846,36 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
index 3fe2c7bbc5e30..32e62b36b322d 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
@@ -6,6 +6,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
define amdgpu_kernel void @global_wavefront_unordered_load(
; GFX6-LABEL: global_wavefront_unordered_load:
@@ -90,6 +92,26 @@ define amdgpu_kernel void @global_wavefront_unordered_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront") unordered, align 4
@@ -180,6 +202,26 @@ define amdgpu_kernel void @global_wavefront_monotonic_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront") monotonic, align 4
@@ -270,6 +312,26 @@ define amdgpu_kernel void @global_wavefront_acquire_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront") acquire, align 4
@@ -360,6 +422,26 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront") seq_cst, align 4
@@ -442,6 +524,26 @@ define amdgpu_kernel void @global_wavefront_unordered_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront") unordered, align 4
@@ -523,6 +625,26 @@ define amdgpu_kernel void @global_wavefront_monotonic_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront") monotonic, align 4
@@ -604,6 +726,26 @@ define amdgpu_kernel void @global_wavefront_release_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront") release, align 4
@@ -685,6 +827,26 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront") seq_cst, align 4
@@ -766,6 +928,26 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") monotonic
@@ -847,6 +1029,26 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") acquire
@@ -928,6 +1130,26 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_release_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") release
@@ -1009,6 +1231,26 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") acq_rel
@@ -1090,6 +1332,26 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") seq_cst
@@ -1185,6 +1447,30 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") acquire
@@ -1281,6 +1567,30 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") acq_rel
@@ -1377,6 +1687,30 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") seq_cst
@@ -1466,6 +1800,26 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1555,6 +1909,26 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1644,6 +2018,26 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1733,6 +2127,26 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1822,6 +2236,26 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1911,6 +2345,26 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2000,6 +2454,26 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2089,6 +2563,26 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2178,6 +2672,26 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2267,6 +2781,26 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2356,6 +2890,26 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2445,6 +2999,26 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2534,6 +3108,26 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2623,6 +3217,26 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2712,6 +3326,26 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2817,6 +3451,30 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2924,6 +3582,30 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3031,6 +3713,30 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3138,6 +3844,30 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3245,6 +3975,30 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3352,6 +4106,30 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3459,6 +4237,30 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3566,6 +4368,30 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3673,6 +4499,30 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3780,6 +4630,30 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3887,6 +4761,30 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3994,6 +4892,30 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4101,6 +5023,30 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4208,6 +5154,30 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4315,6 +5285,30 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4407,6 +5401,26 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") unordered, align 4
@@ -4497,6 +5511,26 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") monotonic, align 4
@@ -4587,6 +5621,26 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") acquire, align 4
@@ -4677,6 +5731,26 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") seq_cst, align 4
@@ -4759,6 +5833,26 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") unordered, align 4
@@ -4840,6 +5934,26 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") monotonic, align 4
@@ -4921,6 +6035,26 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") release, align 4
@@ -5002,6 +6136,26 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") seq_cst, align 4
@@ -5083,6 +6237,26 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") monotonic
@@ -5164,6 +6338,26 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -5245,6 +6439,26 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") release
@@ -5326,6 +6540,26 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -5407,6 +6641,26 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -5502,6 +6756,30 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -5598,6 +6876,30 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -5694,6 +6996,30 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -5783,6 +7109,26 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5872,6 +7218,26 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5961,6 +7327,26 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6050,6 +7436,26 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6139,6 +7545,26 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6228,6 +7654,26 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6317,6 +7763,26 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6406,6 +7872,26 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6495,6 +7981,26 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6584,6 +8090,26 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6673,6 +8199,26 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6762,6 +8308,26 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6851,6 +8417,26 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6940,6 +8526,26 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7029,6 +8635,26 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7134,6 +8760,30 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7241,6 +8891,30 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7348,6 +9022,30 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7455,6 +9153,30 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7562,6 +9284,30 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7669,6 +9415,30 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7776,6 +9546,30 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7883,6 +9677,30 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7990,6 +9808,30 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8097,6 +9939,30 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8204,6 +10070,30 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8311,6 +10201,30 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8418,6 +10332,30 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8525,6 +10463,30 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8632,6 +10594,30 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
index 339cb981b3f2c..7fa48739f63b5 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
@@ -6,6 +6,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
define amdgpu_kernel void @global_workgroup_unordered_load(
; GFX6-LABEL: global_workgroup_unordered_load:
@@ -90,6 +92,26 @@ define amdgpu_kernel void @global_workgroup_unordered_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") unordered, align 4
@@ -180,6 +202,26 @@ define amdgpu_kernel void @global_workgroup_monotonic_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") monotonic, align 4
@@ -272,6 +314,27 @@ define amdgpu_kernel void @global_workgroup_acquire_load(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") acquire, align 4
@@ -368,6 +431,27 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") seq_cst, align 4
@@ -450,6 +534,26 @@ define amdgpu_kernel void @global_workgroup_unordered_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") unordered, align 4
@@ -531,6 +635,26 @@ define amdgpu_kernel void @global_workgroup_monotonic_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") monotonic, align 4
@@ -620,6 +744,28 @@ define amdgpu_kernel void @global_workgroup_release_store(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") release, align 4
@@ -709,6 +855,28 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") seq_cst, align 4
@@ -790,6 +958,26 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") monotonic
@@ -875,6 +1063,28 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acquire
@@ -964,6 +1174,28 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_release_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") release
@@ -1057,6 +1289,30 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acq_rel
@@ -1150,6 +1406,30 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") seq_cst
@@ -1247,6 +1527,31 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acquire
@@ -1353,6 +1658,33 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acq_rel
@@ -1459,6 +1791,33 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") seq_cst
@@ -1548,6 +1907,26 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1641,6 +2020,28 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1738,6 +2139,28 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_release_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1839,6 +2262,30 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1940,6 +2387,30 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2033,6 +2504,28 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2126,6 +2619,28 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2227,6 +2742,30 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2328,6 +2867,30 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2429,6 +2992,30 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2530,6 +3117,30 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2631,6 +3242,30 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2732,6 +3367,30 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2833,6 +3492,30 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2934,6 +3617,30 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3039,6 +3746,30 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3148,6 +3879,31 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3263,6 +4019,32 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3380,6 +4162,33 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3497,6 +4306,33 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3606,6 +4442,31 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3715,6 +4576,31 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3832,6 +4718,33 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3949,6 +4862,33 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4066,6 +5006,33 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4183,6 +5150,33 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4300,6 +5294,33 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4417,6 +5438,33 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4534,6 +5582,33 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4651,6 +5726,33 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4743,6 +5845,26 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") unordered, align 4
@@ -4833,6 +5955,26 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") monotonic, align 4
@@ -4925,6 +6067,27 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") acquire, align 4
@@ -5018,6 +6181,27 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") seq_cst, align 4
@@ -5100,6 +6284,26 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") unordered, align 4
@@ -5181,6 +6385,26 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") monotonic, align 4
@@ -5265,6 +6489,27 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") release, align 4
@@ -5349,6 +6594,27 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") seq_cst, align 4
@@ -5430,6 +6696,26 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") monotonic
@@ -5515,6 +6801,28 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -5599,6 +6907,27 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") release
@@ -5687,6 +7016,29 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -5775,6 +7127,29 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -5872,6 +7247,31 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -5973,6 +7373,32 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -6074,6 +7500,32 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[2:3] sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -6163,6 +7615,26 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6256,6 +7728,28 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6348,6 +7842,27 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6444,6 +7959,29 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6540,6 +8078,29 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6633,6 +8194,28 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6726,6 +8309,28 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6822,6 +8427,29 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6918,6 +8546,29 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7014,6 +8665,29 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7110,6 +8784,29 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7206,6 +8903,29 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7302,6 +9022,29 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7398,6 +9141,29 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7494,6 +9260,29 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7599,6 +9388,30 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7708,6 +9521,31 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7818,6 +9656,31 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7930,6 +9793,32 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8042,6 +9931,32 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8151,6 +10066,31 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8260,6 +10200,31 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8372,6 +10337,32 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8484,6 +10475,32 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8596,6 +10613,32 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8708,6 +10751,32 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8820,6 +10889,32 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8932,6 +11027,32 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9044,6 +11165,32 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9156,6 +11303,32 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
index a328fd34ba1f1..c0dcb96de35a4 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
@@ -6,6 +6,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
define amdgpu_kernel void @local_agent_unordered_load(
; GFX6-LABEL: local_agent_unordered_load:
@@ -88,6 +90,28 @@ define amdgpu_kernel void @local_agent_unordered_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("agent") unordered, align 4
@@ -176,6 +200,28 @@ define amdgpu_kernel void @local_agent_monotonic_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("agent") monotonic, align 4
@@ -266,6 +312,29 @@ define amdgpu_kernel void @local_agent_acquire_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("agent") acquire, align 4
@@ -364,6 +433,31 @@ define amdgpu_kernel void @local_agent_seq_cst_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("agent") seq_cst, align 4
@@ -438,6 +532,24 @@ define amdgpu_kernel void @local_agent_unordered_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent") unordered, align 4
@@ -511,6 +623,24 @@ define amdgpu_kernel void @local_agent_monotonic_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent") monotonic, align 4
@@ -592,6 +722,26 @@ define amdgpu_kernel void @local_agent_release_store(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent") release, align 4
@@ -673,6 +823,26 @@ define amdgpu_kernel void @local_agent_seq_cst_store(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent") seq_cst, align 4
@@ -746,6 +916,24 @@ define amdgpu_kernel void @local_agent_monotonic_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_monotonic_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") monotonic
@@ -827,6 +1015,26 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw(
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_acquire_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") acquire
@@ -908,6 +1116,26 @@ define amdgpu_kernel void @local_agent_release_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_release_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_release_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") release
@@ -997,6 +1225,28 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw(
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_acq_rel_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_acq_rel_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") acq_rel
@@ -1086,6 +1336,28 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw(
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") seq_cst
@@ -1175,6 +1447,29 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_acquire_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") acquire
@@ -1273,6 +1568,31 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_acq_rel_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_acq_rel_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") acq_rel
@@ -1371,6 +1691,31 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") seq_cst
@@ -1452,6 +1797,26 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_monotonic_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1541,6 +1906,28 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_acquire_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1630,6 +2017,28 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_release_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_release_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1727,6 +2136,30 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1824,6 +2257,30 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1913,6 +2370,28 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_monotonic_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2002,6 +2481,28 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_acquire_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2099,6 +2600,30 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_release_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_release_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2196,6 +2721,30 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_acq_rel_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_acq_rel_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2293,6 +2842,30 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2390,6 +2963,30 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_monotonic_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2487,6 +3084,30 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_acquire_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2584,6 +3205,30 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_release_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_release_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2681,6 +3326,30 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_acq_rel_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_acq_rel_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2778,6 +3447,30 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2873,6 +3566,30 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2972,6 +3689,31 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_acquire_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3077,6 +3819,32 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_release_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_release_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3184,6 +3952,33 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3291,6 +4086,33 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3390,6 +4212,31 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_monotonic_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3489,6 +4336,31 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_acquire_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3596,6 +4468,33 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_release_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_release_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3703,6 +4602,33 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3810,6 +4736,33 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3917,6 +4870,33 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4024,6 +5004,33 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4131,6 +5138,33 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_release_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_release_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4238,6 +5272,33 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4345,6 +5406,33 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4435,6 +5523,28 @@ define amdgpu_kernel void @local_agent_one_as_unordered_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") unordered, align 4
@@ -4523,6 +5633,28 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") monotonic, align 4
@@ -4611,6 +5743,28 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") acquire, align 4
@@ -4699,6 +5853,28 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") seq_cst, align 4
@@ -4773,6 +5949,24 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") unordered, align 4
@@ -4846,6 +6040,24 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") monotonic, align 4
@@ -4919,6 +6131,24 @@ define amdgpu_kernel void @local_agent_one_as_release_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") release, align 4
@@ -4992,6 +6222,24 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") seq_cst, align 4
@@ -5065,6 +6313,24 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") monotonic
@@ -5138,6 +6404,24 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acquire
@@ -5211,6 +6495,24 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_release_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_release_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") release
@@ -5284,6 +6586,24 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -5357,6 +6677,24 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -5444,6 +6782,28 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acquire
@@ -5532,6 +6892,28 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -5620,6 +7002,28 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -5701,6 +7105,26 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5782,6 +7206,26 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5863,6 +7307,26 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_release_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_release_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5944,6 +7408,26 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6025,6 +7509,26 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6106,6 +7610,26 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6187,6 +7711,26 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6268,6 +7812,26 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6349,6 +7913,26 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6430,6 +8014,26 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6511,6 +8115,26 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6592,6 +8216,26 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6673,6 +8317,26 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_release_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_release_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6754,6 +8418,26 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6835,6 +8519,26 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6930,6 +8634,30 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7027,6 +8755,30 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7124,6 +8876,30 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7221,6 +8997,30 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7318,6 +9118,30 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7415,6 +9239,30 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7512,6 +9360,30 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7609,6 +9481,30 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7706,6 +9602,30 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7803,6 +9723,30 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7900,6 +9844,30 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7997,6 +9965,30 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8094,6 +10086,30 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8191,6 +10207,30 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8288,6 +10328,30 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
index df3e5226c539f..75a65723b0874 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
@@ -6,6 +6,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
define amdgpu_kernel void @local_nontemporal_load_0(
; GFX6-LABEL: local_nontemporal_load_0:
@@ -99,6 +101,30 @@ define amdgpu_kernel void @local_nontemporal_load_0(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_load_0:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_nontemporal_load_0:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
entry:
%val = load i32, i32 addrspace(3)* %in, align 4, !nontemporal !0
@@ -201,6 +227,30 @@ define amdgpu_kernel void @local_nontemporal_load_1(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_load_1:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_nontemporal_load_1:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -299,6 +349,30 @@ define amdgpu_kernel void @local_nontemporal_store_0(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_store_0:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_nontemporal_store_0:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4
@@ -398,6 +472,30 @@ define amdgpu_kernel void @local_nontemporal_store_1(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_store_1:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_nontemporal_store_1:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4
+; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll
index 67ce190072b5b..8e8080ed81ff8 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll
@@ -6,6 +6,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
define amdgpu_kernel void @local_singlethread_unordered_load(
; GFX6-LABEL: local_singlethread_unordered_load:
@@ -88,6 +90,28 @@ define amdgpu_kernel void @local_singlethread_unordered_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") unordered, align 4
@@ -176,6 +200,28 @@ define amdgpu_kernel void @local_singlethread_monotonic_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") monotonic, align 4
@@ -264,6 +310,28 @@ define amdgpu_kernel void @local_singlethread_acquire_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") acquire, align 4
@@ -352,6 +420,28 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") seq_cst, align 4
@@ -426,6 +516,24 @@ define amdgpu_kernel void @local_singlethread_unordered_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") unordered, align 4
@@ -499,6 +607,24 @@ define amdgpu_kernel void @local_singlethread_monotonic_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") monotonic, align 4
@@ -572,6 +698,24 @@ define amdgpu_kernel void @local_singlethread_release_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") release, align 4
@@ -645,6 +789,24 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") seq_cst, align 4
@@ -718,6 +880,24 @@ define amdgpu_kernel void @local_singlethread_monotonic_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") monotonic
@@ -791,6 +971,24 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acquire
@@ -864,6 +1062,24 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_release_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_release_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") release
@@ -937,6 +1153,24 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_acq_rel_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acq_rel
@@ -1010,6 +1244,24 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") seq_cst
@@ -1097,6 +1349,28 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acquire
@@ -1185,6 +1459,28 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_acq_rel_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acq_rel
@@ -1273,6 +1569,28 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") seq_cst
@@ -1354,6 +1672,26 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1435,6 +1773,26 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1516,6 +1874,26 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_release_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_release_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1597,6 +1975,26 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1678,6 +2076,26 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1759,6 +2177,26 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1840,6 +2278,26 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1921,6 +2379,26 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_release_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_release_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2002,6 +2480,26 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_acq_rel_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2083,6 +2581,26 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2164,6 +2682,26 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2245,6 +2783,26 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2326,6 +2884,26 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_release_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_release_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2407,6 +2985,26 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2488,6 +3086,26 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2583,6 +3201,30 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2680,6 +3322,30 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2777,6 +3443,30 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_release_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_release_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2874,6 +3564,30 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2971,6 +3685,30 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3068,6 +3806,30 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3165,6 +3927,30 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3262,6 +4048,30 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_release_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_release_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3359,6 +4169,30 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3456,6 +4290,30 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3553,6 +4411,30 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3650,6 +4532,30 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3747,6 +4653,30 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3844,6 +4774,30 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3941,6 +4895,30 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4031,6 +5009,28 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") unordered, align 4
@@ -4119,6 +5119,28 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") monotonic, align 4
@@ -4207,6 +5229,28 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") acquire, align 4
@@ -4295,6 +5339,28 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") seq_cst, align 4
@@ -4369,6 +5435,24 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") unordered, align 4
@@ -4442,6 +5526,24 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") monotonic, align 4
@@ -4515,6 +5617,24 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") release, align 4
@@ -4588,6 +5708,24 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") seq_cst, align 4
@@ -4661,6 +5799,24 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") monotonic
@@ -4734,6 +5890,24 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -4807,6 +5981,24 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_release_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") release
@@ -4880,6 +6072,24 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -4953,6 +6163,24 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -5040,6 +6268,28 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -5128,6 +6378,28 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -5216,6 +6488,28 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -5297,6 +6591,26 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5378,6 +6692,26 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5459,6 +6793,26 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5540,6 +6894,26 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5621,6 +6995,26 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5702,6 +7096,26 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5783,6 +7197,26 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5864,6 +7298,26 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5945,6 +7399,26 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6026,6 +7500,26 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6107,6 +7601,26 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6188,6 +7702,26 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6269,6 +7803,26 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6350,6 +7904,26 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6431,6 +8005,26 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6526,6 +8120,30 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6623,6 +8241,30 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6720,6 +8362,30 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6817,6 +8483,30 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6914,6 +8604,30 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7011,6 +8725,30 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7108,6 +8846,30 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7205,6 +8967,30 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7302,6 +9088,30 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7399,6 +9209,30 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7496,6 +9330,30 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7593,6 +9451,30 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7690,6 +9572,30 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7787,6 +9693,30 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7884,6 +9814,30 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
index 4efd46d271600..8e0770e9007d8 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
@@ -6,6 +6,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
define amdgpu_kernel void @local_system_unordered_load(
; GFX6-LABEL: local_system_unordered_load:
@@ -88,6 +90,28 @@ define amdgpu_kernel void @local_system_unordered_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in unordered, align 4
@@ -176,6 +200,28 @@ define amdgpu_kernel void @local_system_monotonic_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in monotonic, align 4
@@ -266,6 +312,29 @@ define amdgpu_kernel void @local_system_acquire_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in acquire, align 4
@@ -364,6 +433,31 @@ define amdgpu_kernel void @local_system_seq_cst_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in seq_cst, align 4
@@ -438,6 +532,24 @@ define amdgpu_kernel void @local_system_unordered_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out unordered, align 4
@@ -511,6 +623,24 @@ define amdgpu_kernel void @local_system_monotonic_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out monotonic, align 4
@@ -592,6 +722,26 @@ define amdgpu_kernel void @local_system_release_store(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out release, align 4
@@ -673,6 +823,26 @@ define amdgpu_kernel void @local_system_seq_cst_store(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out seq_cst, align 4
@@ -746,6 +916,24 @@ define amdgpu_kernel void @local_system_monotonic_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_monotonic_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in monotonic
@@ -827,6 +1015,26 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw(
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_acquire_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in acquire
@@ -908,6 +1116,26 @@ define amdgpu_kernel void @local_system_release_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_release_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_release_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in release
@@ -997,6 +1225,28 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw(
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_acq_rel_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_acq_rel_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in acq_rel
@@ -1086,6 +1336,28 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw(
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_seq_cst_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in seq_cst
@@ -1175,6 +1447,29 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_acquire_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in acquire
@@ -1273,6 +1568,31 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_acq_rel_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_acq_rel_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in acq_rel
@@ -1371,6 +1691,31 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_seq_cst_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in seq_cst
@@ -1452,6 +1797,26 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_monotonic_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1541,6 +1906,28 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_acquire_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1630,6 +2017,28 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_release_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_release_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1727,6 +2136,30 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_acq_rel_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_acq_rel_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1824,6 +2257,30 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_seq_cst_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1913,6 +2370,28 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_monotonic_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2002,6 +2481,28 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_acquire_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2099,6 +2600,30 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_release_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_release_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2196,6 +2721,30 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_acq_rel_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_acq_rel_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2293,6 +2842,30 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_seq_cst_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2390,6 +2963,30 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_monotonic_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2487,6 +3084,30 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_acquire_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2584,6 +3205,30 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_release_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_release_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2681,6 +3326,30 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_acq_rel_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_acq_rel_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2778,6 +3447,30 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2873,6 +3566,30 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_monotonic_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2972,6 +3689,31 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_acquire_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3077,6 +3819,32 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_release_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_release_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3184,6 +3952,33 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3291,6 +4086,33 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3390,6 +4212,31 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_monotonic_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3489,6 +4336,31 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_acquire_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3596,6 +4468,33 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_release_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_release_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3703,6 +4602,33 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3810,6 +4736,33 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3917,6 +4870,33 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4024,6 +5004,33 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_acquire_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_acquire_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4131,6 +5138,33 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_release_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_release_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4238,6 +5272,33 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4345,6 +5406,33 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4435,6 +5523,28 @@ define amdgpu_kernel void @local_system_one_as_unordered_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") unordered, align 4
@@ -4523,6 +5633,28 @@ define amdgpu_kernel void @local_system_one_as_monotonic_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") monotonic, align 4
@@ -4611,6 +5743,28 @@ define amdgpu_kernel void @local_system_one_as_acquire_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") acquire, align 4
@@ -4699,6 +5853,28 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") seq_cst, align 4
@@ -4773,6 +5949,24 @@ define amdgpu_kernel void @local_system_one_as_unordered_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") unordered, align 4
@@ -4846,6 +6040,24 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") monotonic, align 4
@@ -4919,6 +6131,24 @@ define amdgpu_kernel void @local_system_one_as_release_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") release, align 4
@@ -4992,6 +6222,24 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") seq_cst, align 4
@@ -5065,6 +6313,24 @@ define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") monotonic
@@ -5138,6 +6404,24 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acquire
@@ -5211,6 +6495,24 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_release_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_release_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") release
@@ -5284,6 +6586,24 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acq_rel
@@ -5357,6 +6677,24 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") seq_cst
@@ -5444,6 +6782,28 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acquire
@@ -5532,6 +6892,28 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_acq_rel_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acq_rel
@@ -5620,6 +7002,28 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") seq_cst
@@ -5701,6 +7105,26 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5782,6 +7206,26 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5863,6 +7307,26 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_release_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_release_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5944,6 +7408,26 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6025,6 +7509,26 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6106,6 +7610,26 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6187,6 +7711,26 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6268,6 +7812,26 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6349,6 +7913,26 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6430,6 +8014,26 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6511,6 +8115,26 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6592,6 +8216,26 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6673,6 +8317,26 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_release_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_release_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6754,6 +8418,26 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6835,6 +8519,26 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6930,6 +8634,30 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7027,6 +8755,30 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7124,6 +8876,30 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7221,6 +8997,30 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7318,6 +9118,30 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7415,6 +9239,30 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7512,6 +9360,30 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7609,6 +9481,30 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_release_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_release_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7706,6 +9602,30 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7803,6 +9723,30 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7900,6 +9844,30 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7997,6 +9965,30 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8094,6 +10086,30 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8191,6 +10207,30 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8288,6 +10328,30 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll
index 046325f5437f8..ba58a05328642 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll
@@ -6,6 +6,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
define amdgpu_kernel void @local_wavefront_unordered_load(
; GFX6-LABEL: local_wavefront_unordered_load:
@@ -88,6 +90,28 @@ define amdgpu_kernel void @local_wavefront_unordered_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront") unordered, align 4
@@ -176,6 +200,28 @@ define amdgpu_kernel void @local_wavefront_monotonic_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront") monotonic, align 4
@@ -264,6 +310,28 @@ define amdgpu_kernel void @local_wavefront_acquire_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront") acquire, align 4
@@ -352,6 +420,28 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront") seq_cst, align 4
@@ -426,6 +516,24 @@ define amdgpu_kernel void @local_wavefront_unordered_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront") unordered, align 4
@@ -499,6 +607,24 @@ define amdgpu_kernel void @local_wavefront_monotonic_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront") monotonic, align 4
@@ -572,6 +698,24 @@ define amdgpu_kernel void @local_wavefront_release_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront") release, align 4
@@ -645,6 +789,24 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront") seq_cst, align 4
@@ -718,6 +880,24 @@ define amdgpu_kernel void @local_wavefront_monotonic_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") monotonic
@@ -791,6 +971,24 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") acquire
@@ -864,6 +1062,24 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_release_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_release_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") release
@@ -937,6 +1153,24 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_acq_rel_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") acq_rel
@@ -1010,6 +1244,24 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") seq_cst
@@ -1097,6 +1349,28 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") acquire
@@ -1185,6 +1459,28 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_acq_rel_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") acq_rel
@@ -1273,6 +1569,28 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") seq_cst
@@ -1354,6 +1672,26 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1435,6 +1773,26 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1516,6 +1874,26 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_release_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_release_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1597,6 +1975,26 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1678,6 +2076,26 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1759,6 +2177,26 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1840,6 +2278,26 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1921,6 +2379,26 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_release_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_release_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2002,6 +2480,26 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_acq_rel_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2083,6 +2581,26 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2164,6 +2682,26 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2245,6 +2783,26 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2326,6 +2884,26 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_release_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_release_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2407,6 +2985,26 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2488,6 +3086,26 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2583,6 +3201,30 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2680,6 +3322,30 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2777,6 +3443,30 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_release_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_release_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2874,6 +3564,30 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2971,6 +3685,30 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3068,6 +3806,30 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3165,6 +3927,30 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3262,6 +4048,30 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_release_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_release_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3359,6 +4169,30 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3456,6 +4290,30 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3553,6 +4411,30 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3650,6 +4532,30 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3747,6 +4653,30 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3844,6 +4774,30 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3941,6 +4895,30 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4031,6 +5009,28 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") unordered, align 4
@@ -4119,6 +5119,28 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") monotonic, align 4
@@ -4207,6 +5229,28 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") acquire, align 4
@@ -4295,6 +5339,28 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") seq_cst, align 4
@@ -4369,6 +5435,24 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") unordered, align 4
@@ -4442,6 +5526,24 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") monotonic, align 4
@@ -4515,6 +5617,24 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") release, align 4
@@ -4588,6 +5708,24 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") seq_cst, align 4
@@ -4661,6 +5799,24 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") monotonic
@@ -4734,6 +5890,24 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -4807,6 +5981,24 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_release_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") release
@@ -4880,6 +6072,24 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -4953,6 +6163,24 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -5040,6 +6268,28 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -5128,6 +6378,28 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -5216,6 +6488,28 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -5297,6 +6591,26 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5378,6 +6692,26 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5459,6 +6793,26 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5540,6 +6894,26 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5621,6 +6995,26 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5702,6 +7096,26 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5783,6 +7197,26 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5864,6 +7298,26 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5945,6 +7399,26 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6026,6 +7500,26 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6107,6 +7601,26 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6188,6 +7702,26 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6269,6 +7803,26 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6350,6 +7904,26 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6431,6 +8005,26 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6526,6 +8120,30 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6623,6 +8241,30 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6720,6 +8362,30 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6817,6 +8483,30 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6914,6 +8604,30 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7011,6 +8725,30 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7108,6 +8846,30 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7205,6 +8967,30 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7302,6 +9088,30 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7399,6 +9209,30 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7496,6 +9330,30 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7593,6 +9451,30 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7690,6 +9572,30 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7787,6 +9693,30 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7884,6 +9814,30 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
index 580d7a814a7dd..0cfa962ec84cc 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
@@ -6,6 +6,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
define amdgpu_kernel void @local_workgroup_unordered_load(
; GFX6-LABEL: local_workgroup_unordered_load:
@@ -88,6 +90,28 @@ define amdgpu_kernel void @local_workgroup_unordered_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") unordered, align 4
@@ -176,6 +200,28 @@ define amdgpu_kernel void @local_workgroup_monotonic_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") monotonic, align 4
@@ -266,6 +312,29 @@ define amdgpu_kernel void @local_workgroup_acquire_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") acquire, align 4
@@ -364,6 +433,31 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") seq_cst, align 4
@@ -438,6 +532,24 @@ define amdgpu_kernel void @local_workgroup_unordered_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") unordered, align 4
@@ -511,6 +623,24 @@ define amdgpu_kernel void @local_workgroup_monotonic_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") monotonic, align 4
@@ -592,6 +722,26 @@ define amdgpu_kernel void @local_workgroup_release_store(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") release, align 4
@@ -673,6 +823,26 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") seq_cst, align 4
@@ -746,6 +916,24 @@ define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") monotonic
@@ -827,6 +1015,26 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw(
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acquire
@@ -908,6 +1116,26 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_release_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_release_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") release
@@ -997,6 +1225,28 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw(
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_acq_rel_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acq_rel
@@ -1086,6 +1336,28 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw(
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") seq_cst
@@ -1175,6 +1447,29 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acquire
@@ -1273,6 +1568,31 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acq_rel
@@ -1371,6 +1691,31 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") seq_cst
@@ -1452,6 +1797,26 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1541,6 +1906,28 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1630,6 +2017,28 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_release_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_release_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1727,6 +2136,30 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1824,6 +2257,30 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1913,6 +2370,28 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2002,6 +2481,28 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2099,6 +2600,30 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_release_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_release_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2196,6 +2721,30 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2293,6 +2842,30 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2390,6 +2963,30 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2487,6 +3084,30 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2584,6 +3205,30 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_release_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_release_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2681,6 +3326,30 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2778,6 +3447,30 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2873,6 +3566,30 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2972,6 +3689,31 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3077,6 +3819,32 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3184,6 +3952,33 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3291,6 +4086,33 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3390,6 +4212,31 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3489,6 +4336,31 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3596,6 +4468,33 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3703,6 +4602,33 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3810,6 +4736,33 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3917,6 +4870,33 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4024,6 +5004,33 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4131,6 +5138,33 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4238,6 +5272,33 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4345,6 +5406,33 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: buffer_inv sc0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4435,6 +5523,28 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_unordered_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_unordered_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") unordered, align 4
@@ -4523,6 +5633,28 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") monotonic, align 4
@@ -4611,6 +5743,28 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") acquire, align 4
@@ -4699,6 +5853,28 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_load:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_load:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") seq_cst, align 4
@@ -4773,6 +5949,24 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_unordered_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_unordered_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") unordered, align 4
@@ -4846,6 +6040,24 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") monotonic, align 4
@@ -4919,6 +6131,24 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_release_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") release, align 4
@@ -4992,6 +6222,24 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_store:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_store:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") seq_cst, align 4
@@ -5065,6 +6313,24 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") monotonic
@@ -5138,6 +6404,24 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -5211,6 +6495,24 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_release_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") release
@@ -5284,6 +6586,24 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -5357,6 +6677,24 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -5444,6 +6782,28 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -5532,6 +6892,28 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -5620,6 +7002,28 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
+; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -5701,6 +7105,26 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5782,6 +7206,26 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5863,6 +7307,26 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5944,6 +7408,26 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6025,6 +7509,26 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6106,6 +7610,26 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6187,6 +7711,26 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6268,6 +7812,26 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6349,6 +7913,26 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6430,6 +8014,26 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6511,6 +8115,26 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6592,6 +8216,26 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6673,6 +8317,26 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6754,6 +8418,26 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6835,6 +8519,26 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6930,6 +8634,30 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7027,6 +8755,30 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7124,6 +8876,30 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7221,6 +8997,30 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7318,6 +9118,30 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7415,6 +9239,30 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7512,6 +9360,30 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7609,6 +9481,30 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7706,6 +9602,30 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7803,6 +9723,30 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7900,6 +9844,30 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7997,6 +9965,30 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8094,6 +10086,30 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8191,6 +10207,30 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8288,6 +10328,30 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
index 0fd1e7a04e4b5..b511b98ac2551 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
@@ -6,6 +6,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
define amdgpu_kernel void @private_nontemporal_load_0(
; GFX6-LABEL: private_nontemporal_load_0:
@@ -125,6 +127,28 @@ define amdgpu_kernel void @private_nontemporal_load_0(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_load_0:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s4 nt
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: private_nontemporal_load_0:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: scratch_load_dword v0, off, s4 nt
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
entry:
%val = load i32, i32 addrspace(5)* %in, align 4, !nontemporal !0
@@ -253,6 +277,30 @@ define amdgpu_kernel void @private_nontemporal_load_1(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_load_1:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4
+; GFX940-NOTTGSPLIT-NEXT: scratch_load_dword v0, v0, off nt
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: private_nontemporal_load_1:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4
+; GFX940-TGSPLIT-NEXT: scratch_load_dword v0, v0, off nt
+; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
+; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -378,6 +426,28 @@ define amdgpu_kernel void @private_nontemporal_store_0(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_store_0:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s4 nt
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: private_nontemporal_store_0:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
+; GFX940-TGSPLIT-NEXT: scratch_store_dword off, v0, s4 nt
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4
@@ -504,6 +574,30 @@ define amdgpu_kernel void @private_nontemporal_store_1(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
; GFX90A-TGSPLIT-NEXT: s_endpgm
+;
+; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_store_1:
+; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4
+; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NOTTGSPLIT-NEXT: scratch_store_dword v0, v1, off nt
+; GFX940-NOTTGSPLIT-NEXT: s_endpgm
+;
+; GFX940-TGSPLIT-LABEL: private_nontemporal_store_1:
+; GFX940-TGSPLIT: ; %bb.0: ; %entry
+; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4
+; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-TGSPLIT-NEXT: scratch_store_dword v0, v1, off nt
+; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
More information about the llvm-commits
mailing list