[llvm] 924a64a - [AMDGPU] Only emit SCOPE_SYS global_wb (#110636)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Oct 6 22:35:36 PDT 2024
Author: Pierre van Houtryve
Date: 2024-10-07T07:35:31+02:00
New Revision: 924a64a3486f9962c42d4ec253774eb2c586ac33
URL: https://github.com/llvm/llvm-project/commit/924a64a3486f9962c42d4ec253774eb2c586ac33
DIFF: https://github.com/llvm/llvm-project/commit/924a64a3486f9962c42d4ec253774eb2c586ac33.diff
LOG: [AMDGPU] Only emit SCOPE_SYS global_wb (#110636)
global_wb with scopes lower than SCOPE_SYS is unnecessary for
correctness.
I was initially optimistic they would be very cheap no-ops but they can
actually be quite expensive so let's avoid them.
Added:
Modified:
llvm/docs/AMDGPUUsage.rst
llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
Removed:
################################################################################
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 70dc2006933421..6ff3272422fe95 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -14241,8 +14241,13 @@ For GFX12:
* ``global_inv`` invalidates caches whose scope is strictly smaller than the
instruction's. The invalidation requests cannot be reordered with pending or
upcoming memory operations.
-* ``global_wb`` additionally ensures that previous memory operation done at
- a lower scope level have reached the ``SCOPE:`` of the ``global_wb``.
+* ``global_wb`` is a writeback operation that additionally ensures previous
+ memory operation done at a lower scope level have reached the ``SCOPE:``
+ of the ``global_wb``.
+
+ * ``global_wb`` can be omitted for scopes other than ``SCOPE_SYS`` in
+ gfx120x.
+
* The vector memory operations access a vector L0 cache. There is a single L0
cache per CU. Each SIMD of a CU accesses the same L0 cache. Therefore, no
special action is required for coherence between the lanes of a single
@@ -14949,19 +14954,7 @@ the instruction in the code sequence that references the table.
store atomic release - singlethread - global 1. buffer/global/ds/flat_store
- wavefront - local
- generic
- store atomic release - workgroup - global 1. ``global_wb scope:SCOPE_SE``
-
- - If CU wavefront execution
- mode, omit.
- - In combination with the waits
- below, ensures that all
- memory operations
- have completed at workgroup
- scope before performing the
- store that is being
- released.
-
- 2. | ``s_wait_bvhcnt 0x0``
+ store atomic release - workgroup - global 1. | ``s_wait_bvhcnt 0x0``
| ``s_wait_samplecnt 0x0``
| ``s_wait_storecnt 0x0``
| ``s_wait_loadcnt 0x0``
@@ -14984,7 +14977,11 @@ the instruction in the code sequence that references the table.
atomicrmw-with-return-value.
- ``s_wait_storecnt 0x0``
must happen after
- ``global_wb``.
+ any preceding
+ global/generic
+ store/store
+ atomic/
+ atomicrmw-no-return-value.
- ``s_wait_dscnt 0x0``
must happen after
any preceding
@@ -15004,19 +15001,7 @@ the instruction in the code sequence that references the table.
- Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx12-scopes-table`.
- store atomic release - workgroup - local 1. ``global_wb scope:SCOPE_SE``
-
- - If CU wavefront execution
- mode or OpenCL, omit.
- - In combination with the waits
- below, ensures that all
- memory operations
- have completed at workgroup
- scope before performing the
- store that is being
- released.
-
- 2. | ``s_wait_bvhcnt 0x0``
+ store atomic release - workgroup - local 1. | ``s_wait_bvhcnt 0x0``
| ``s_wait_samplecnt 0x0``
| ``s_wait_storecnt 0x0``
| ``s_wait_loadcnt 0x0``
@@ -15039,7 +15024,11 @@ the instruction in the code sequence that references the table.
atomicrmw-with-return-value.
- ``s_wait_storecnt 0x0``
must happen after
- ``global_wb``.
+ any preceding
+ global/generic
+ store/store
+ atomic/
+ atomicrmw-no-return-value.
- Must happen before the
following store.
- Ensures that all
@@ -15051,16 +15040,9 @@ the instruction in the code sequence that references the table.
released.
3. ds_store
- store atomic release - agent - global 1. ``global_wb``
+ store atomic release - agent - global 1. ``global_wb scope:SCOPE_SYS``
- system - generic
- - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx12-scopes-table`.
- - In combination with the waits
- below, ensures that all
- memory operations
- have completed at agent or system
- scope before performing the
- store that is being
- released.
+ - If agent scope, omit.
2. | ``s_wait_bvhcnt 0x0``
| ``s_wait_samplecnt 0x0``
@@ -15084,7 +15066,12 @@ the instruction in the code sequence that references the table.
atomicrmw-with-return-value.
- ``s_wait_storecnt 0x0``
must happen after
- ``global_wb``.
+ ``global_wb`` if present, or
+ any preceding
+ global/generic
+ store/store
+ atomic/
+ atomicrmw-no-return-value.
- ``s_wait_dscnt 0x0``
must happen after
any preceding
@@ -15109,20 +15096,8 @@ the instruction in the code sequence that references the table.
atomicrmw release - singlethread - global 1. buffer/global/ds/flat_atomic
- wavefront - local
- generic
- atomicrmw release - workgroup - global 1. ``global_wb scope:SCOPE_SE``
- - generic
- - If CU wavefront execution
- mode, omit.
- - In combination with the waits
- below, ensures that all
- memory operations
- have completed at workgroup
- scope before performing the
- store that is being
- released.
-
- 2. | ``s_wait_bvhcnt 0x0``
- | ``s_wait_samplecnt 0x0``
+ atomicrmw release - workgroup - global 1. | ``s_wait_bvhcnt 0x0``
+ - generic | ``s_wait_samplecnt 0x0``
| ``s_wait_storecnt 0x0``
| ``s_wait_loadcnt 0x0``
| ``s_wait_dscnt 0x0``
@@ -15145,15 +15120,19 @@ the instruction in the code sequence that references the table.
atomic/
atomicrmw-with-return-value.
- ``s_wait_storecnt 0x0``
- must happen after
- ``global_wb``.
+ must happen after
+ any preceding
+ global/generic
+ store/store
+ atomic/
+ atomicrmw-no-return-value.
- ``s_wait_dscnt 0x0``
- must happen after
- any preceding
- local/generic
- load/store/load
- atomic/store
- atomic/atomicrmw.
+ must happen after
+ any preceding
+ local/generic
+ load/store/load
+ atomic/store
+ atomic/atomicrmw.
- Must happen before the
following atomic.
- Ensures that all
@@ -15164,23 +15143,11 @@ the instruction in the code sequence that references the table.
atomicrmw that is
being released.
- 3. buffer/global/flat_atomic
+ 2. buffer/global/flat_atomic
- Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx12-scopes-table`.
- atomicrmw release - workgroup - local 1. ``global_wb scope:SCOPE_SE``
-
- - If CU wavefront execution
- mode or OpenCL, omit.
- - In combination with the waits
- below, ensures that all
- memory operations
- have completed at workgroup
- scope before performing the
- store that is being
- released.
-
- 2. | ``s_wait_bvhcnt 0x0``
+ atomicrmw release - workgroup - local 1. | ``s_wait_bvhcnt 0x0``
| ``s_wait_samplecnt 0x0``
| ``s_wait_storecnt 0x0``
| ``s_wait_loadcnt 0x0``
@@ -15203,7 +15170,11 @@ the instruction in the code sequence that references the table.
atomicrmw-with-return-value.
- ``s_wait_storecnt 0x0``
must happen after
- ``global_wb``.
+ any preceding
+ global/generic
+ store/store
+ atomic/
+ atomicrmw-no-return-value.
- Must happen before the
following atomic.
- Ensures that all
@@ -15214,17 +15185,10 @@ the instruction in the code sequence that references the table.
store that is being
released.
- 3. ds_atomic
- atomicrmw release - agent - global 1. ``global_wb scope:``
+ 2. ds_atomic
+ atomicrmw release - agent - global 1. ``global_wb scope:SCOPE_SYS``
- system - generic
- - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx12-scopes-table`.
- - In combination with the waits
- below, ensures that all
- memory operations
- have completed at agent or system
- scope before performing the
- store that is being
- released.
+ - If agent scope, omit.
2. | ``s_wait_bvhcnt 0x0``
| ``s_wait_samplecnt 0x0``
@@ -15247,7 +15211,12 @@ the instruction in the code sequence that references the table.
atomicrmw-with-return-value.
- ``s_wait_storecnt 0x0``
must happen after
- ``global_wb``
+ ``global_wb`` if present, or
+ any preceding
+ global/generic
+ store/store
+ atomic/
+ atomicrmw-no-return-value.
- ``s_wait_dscnt 0x0``
must happen after
any preceding
@@ -15271,19 +15240,7 @@ the instruction in the code sequence that references the table.
fence release - singlethread *none* *none*
- wavefront
- fence release - workgroup *none* 1. ``global_wb scope:SCOPE_SE``
-
- - If CU wavefront execution
- mode, omit.
- - In combination with the waits
- below, ensures that all
- memory operations
- have completed at workgroup
- scope before performing the
- store that is being
- released.
-
- 2. | ``s_wait_bvhcnt 0x0``
+ fence release - workgroup *none* 1. | ``s_wait_bvhcnt 0x0``
| ``s_wait_samplecnt 0x0``
| ``s_wait_storecnt 0x0``
| ``s_wait_loadcnt 0x0``
@@ -15313,7 +15270,11 @@ the instruction in the code sequence that references the table.
atomicrmw-with-return-value.
- ``s_wait_storecnt 0x0``
must happen after
- ``global_wb``
+ any preceding
+ global/generic
+ store/store
+ atomic/
+ atomicrmw-no-return-value.
- ``s_wait_dscnt 0x0``
must happen after
any preceding
@@ -15339,16 +15300,9 @@ the instruction in the code sequence that references the table.
following
fence-paired-atomic.
- fence release - agent *none* 1. ``global_wb``
+ fence release - agent *none* 1. ``global_wb scope:SCOPE_SYS``
- system
- - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx12-scopes-table`.
- - In combination with the waits
- below, ensures that all
- memory operations
- have completed at agent or system
- scope before performing the
- store that is being
- released.
+ - If agent scope, omit.
2. | ``s_wait_bvhcnt 0x0``
| ``s_wait_samplecnt 0x0``
@@ -15381,7 +15335,12 @@ the instruction in the code sequence that references the table.
atomicrmw-with-return-value.
- ``s_wait_storecnt 0x0``
must happen after
- ``global_wb``
+ ``global_wb`` if present, or
+ any preceding
+ global/generic
+ store/store
+ atomic/
+ atomicrmw-no-return-value.
- ``s_wait_dscnt 0x0``
must happen after
any preceding
@@ -15412,19 +15371,7 @@ the instruction in the code sequence that references the table.
atomicrmw acq_rel - singlethread - global 1. buffer/global/ds/flat_atomic
- wavefront - local
- generic
- atomicrmw acq_rel - workgroup - global 1. ``global_wb scope:SCOPE_SE``
-
- - If CU wavefront execution
- mode, omit.
- - In combination with the waits
- below, ensures that all
- memory operations
- have completed at workgroup
- scope before performing the
- store that is being
- released.
-
- 2. | ``s_wait_bvhcnt 0x0``
+ atomicrmw acq_rel - workgroup - global 1. | ``s_wait_bvhcnt 0x0``
| ``s_wait_samplecnt 0x0``
| ``s_wait_storecnt 0x0``
| ``s_wait_loadcnt 0x0``
@@ -15453,7 +15400,11 @@ the instruction in the code sequence that references the table.
atomicrmw-with-return-value.
- ``s_wait_storecnt 0x0``
must happen after
- ``global_wb``.
+ any preceding
+ global/generic
+ store/store
+ atomic/
+ atomicrmw-no-return-value.
- ``s_wait_dscnt 0x0``
must happen after
any preceding
@@ -15472,13 +15423,13 @@ the instruction in the code sequence that references the table.
atomicrmw that is
being released.
- 3. buffer/global_atomic
+ 2. buffer/global_atomic
- Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx12-scopes-table`.
- If atomic with return, use
``th:TH_ATOMIC_RETURN``.
- 4. | **Atomic with return:**
+ 3. | **Atomic with return:**
| ``s_wait_loadcnt 0x0``
| **Atomic without return:**
| ``s_wait_storecnt 0x0``
@@ -15495,7 +15446,7 @@ the instruction in the code sequence that references the table.
atomicrmw value
being acquired.
- 5. ``global_inv scope:SCOPE_SE``
+ 4. ``global_inv scope:SCOPE_SE``
- If CU wavefront execution
mode, omit.
@@ -15504,19 +15455,7 @@ the instruction in the code sequence that references the table.
loads will not see
stale data.
- atomicrmw acq_rel - workgroup - local 1. ``global_wb scope:SCOPE_SE``
-
- - If CU wavefront execution
- mode or OpenCL, omit.
- - In combination with the waits
- below, ensures that all
- memory operations
- have completed at workgroup
- scope before performing the
- store that is being
- released.
-
- 2. | ``s_wait_bvhcnt 0x0``
+ atomicrmw acq_rel - workgroup - local 1. | ``s_wait_bvhcnt 0x0``
| ``s_wait_samplecnt 0x0``
| ``s_wait_storecnt 0x0``
| ``s_wait_loadcnt 0x0``
@@ -15539,7 +15478,11 @@ the instruction in the code sequence that references the table.
atomicrmw-with-return-value.
- ``s_wait_storecnt 0x0``
must happen after
- ``global_wb``
+ any preceding
+ global/generic
+ store/store
+ atomic/
+ atomicrmw-no-return-value.
- Must happen before
the following
store.
@@ -15551,8 +15494,8 @@ the instruction in the code sequence that references the table.
store that is being
released.
- 3. ds_atomic
- 4. ``s_wait_dscnt 0x0``
+ 2. ds_atomic
+ 3. ``s_wait_dscnt 0x0``
- If OpenCL, omit.
- Must happen before
@@ -15565,7 +15508,7 @@ the instruction in the code sequence that references the table.
atomic value being
acquired.
- 5. ``global_inv scope:SCOPE_SE``
+ 4. ``global_inv scope:SCOPE_SE``
- If CU wavefront execution
mode, omit.
@@ -15575,19 +15518,7 @@ the instruction in the code sequence that references the table.
loads will not see
stale data.
- atomicrmw acq_rel - workgroup - generic 1. ``global_wb scope:SCOPE_SE``
-
- - If CU wavefront execution
- mode or OpenCL, omit.
- - In combination with the waits
- below, ensures that all
- memory operations
- have completed at workgroup
- scope before performing the
- store that is being
- released.
-
- 2. | ``s_wait_bvhcnt 0x0``
+ atomicrmw acq_rel - workgroup - generic 1. | ``s_wait_bvhcnt 0x0``
| ``s_wait_samplecnt 0x0``
| ``s_wait_storecnt 0x0``
| ``s_wait_loadcnt 0x0``
@@ -15610,7 +15541,11 @@ the instruction in the code sequence that references the table.
atomicrmw-with-return-value.
- ``s_wait_storecnt 0x0``
must happen after
- ``global_wb``
+ any preceding
+ global/generic
+ store/store
+ atomic/
+ atomicrmw-no-return-value.
- ``s_wait_dscnt 0x0``
must happen after
any preceding
@@ -15629,13 +15564,13 @@ the instruction in the code sequence that references the table.
atomicrmw that is
being released.
- 3. flat_atomic
+ 2. flat_atomic
- Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx12-scopes-table`.
- If atomic with return,
use ``th:TH_ATOMIC_RETURN``.
- 4. | **Atomic without return:**
+ 3. | **Atomic without return:**
| ``s_wait_dscnt 0x0``
| ``s_wait_storecnt 0x0``
| **Atomic with return:**
@@ -15655,7 +15590,7 @@ the instruction in the code sequence that references the table.
atomic value being
acquired.
- 5. ``global_inv scope:SCOPE_SE``
+ 4. ``global_inv scope:SCOPE_SE``
- If CU wavefront execution
mode, omit.
@@ -15664,16 +15599,9 @@ the instruction in the code sequence that references the table.
loads will not see
stale data.
- atomicrmw acq_rel - agent - global 1. ``global_wb``
+ atomicrmw acq_rel - agent - global 1. ``global_wb scope:SCOPE_SYS``
- system
- - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx12-scopes-table`.
- - In combination with the waits
- below, ensures that all
- memory operations
- have completed at agent or system
- scope before performing the
- store that is being
- released.
+ - If agent scope, omit.
2. | ``s_wait_bvhcnt 0x0``
| ``s_wait_samplecnt 0x0``
@@ -15697,7 +15625,12 @@ the instruction in the code sequence that references the table.
atomicrmw-with-return-value.
- ``s_wait_storecnt 0x0``
must happen after
- ``global_wb``
+ ``global_wb`` if present, or
+ any preceding
+ global/generic
+ store/store
+ atomic/
+ atomicrmw-no-return-value.
- ``s_wait_dscnt 0x0``
must happen after
any preceding
@@ -15749,16 +15682,9 @@ the instruction in the code sequence that references the table.
will not see stale
global data.
- atomicrmw acq_rel - agent - generic 1. ``global_wb``
+ atomicrmw acq_rel - agent - generic 1. ``global_wb scope:SCOPE_SYS``
- system
- - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx12-scopes-table`.
- - In combination with the waits
- below, ensures that all
- memory operations
- have completed at agent or system
- scope before performing the
- store that is being
- released.
+ - If agent scope, omit.
2. | ``s_wait_bvhcnt 0x0``
| ``s_wait_samplecnt 0x0``
@@ -15782,7 +15708,11 @@ the instruction in the code sequence that references the table.
atomicrmw-with-return-value.
- ``s_wait_storecnt 0x0``
must happen after
- ``global_wb``
+ ``global_wb`` if present, or
+ any preceding
+ global/generic
+ store/store atomic/
+ atomicrmw-no-return-value.
- ``s_wait_dscnt 0x0``
must happen after
any preceding
@@ -15841,19 +15771,7 @@ the instruction in the code sequence that references the table.
fence acq_rel - singlethread *none* *none*
- wavefront
- fence acq_rel - workgroup *none* 1. ``global_wb scope:SCOPE_SE``
-
- - If CU wavefront execution
- mode, omit.
- - In combination with the waits
- below, ensures that all
- memory operations
- have completed at workgroup
- scope before performing the
- store that is being
- released.
-
- 2. | ``s_wait_bvhcnt 0x0``
+ fence acq_rel - workgroup *none* 1. | ``s_wait_bvhcnt 0x0``
| ``s_wait_samplecnt 0x0``
| ``s_wait_storecnt 0x0``
| ``s_wait_loadcnt 0x0``
@@ -15887,7 +15805,10 @@ the instruction in the code sequence that references the table.
atomicrmw-with-return-value.
- ``s_wait_storecnt 0x0``
must happen after
- ``global_wb``
+ any preceding
+ global/generic
+ store/store atomic/
+ atomicrmw-no-return-value.
- ``s_wait_dscnt 0x0``
must happen after
any preceding
@@ -15959,7 +15880,7 @@ the instruction in the code sequence that references the table.
the
acquire-fence-paired-atomic.
- 3. ``global_inv scope:SCOPE_SE``
+ 2. ``global_inv scope:SCOPE_SE``
- If CU wavefront execution
mode, omit.
@@ -15968,16 +15889,9 @@ the instruction in the code sequence that references the table.
loads will not see
stale data.
- fence acq_rel - agent *none* 1. ``global_wb``
+ fence acq_rel - agent *none* 1. ``global_wb scope:SCOPE_SYS``
- system
- - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx12-scopes-table`.
- - In combination with the waits
- below, ensures that all
- memory operations
- have completed at agent or system
- scope before performing the
- store that is being
- released.
+ - If agent scope, omit.
2. | ``s_wait_bvhcnt 0x0``
| ``s_wait_samplecnt 0x0``
@@ -16011,7 +15925,11 @@ the instruction in the code sequence that references the table.
atomicrmw-with-return-value.
- ``s_wait_storecnt 0x0``
must happen after
- ``global_wb``
+ ``global_wb`` if present, or
+ any preceding
+ global/generic
+ store/store atomic/
+ atomicrmw-no-return-value.
- ``s_wait_dscnt 0x0``
must happen after
any preceding
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 7c26ae88df3726..be6cff873532b1 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -2477,49 +2477,27 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
if (Pos == Position::AFTER)
++MI;
- // GLOBAL_WB is always needed, even for write-through caches, as it
- // additionally ensures all operations have reached the desired cache level.
+ // global_wb is only necessary at system scope for gfx120x targets.
//
- // Note that we can technically skip emission of SCOPE_SE writebacks for
- // gfx120x as L1 is a buffer there (hence forwards all to L2), but we still
- // emit them. The current strategy we use is to favor mirrorring SW semantics
- // in the ISA whenever it is correct, and the performance cost is very low.
- //
- // This makes the memory model easier to understand, maintain, and also
- // reduces the potential for bugs as it is sometimes difficult to anticipate
- // all possible scenarios in which the WB will actually be needed.
- bool SkipWB = false;
- AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
+ // Emitting it for lower scopes is a slow no-op, so we omit it
+ // for performance.
switch (Scope) {
case SIAtomicScope::SYSTEM:
- ScopeImm = AMDGPU::CPol::SCOPE_SYS;
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
+ .addImm(AMDGPU::CPol::SCOPE_SYS);
break;
case SIAtomicScope::AGENT:
- ScopeImm = AMDGPU::CPol::SCOPE_DEV;
- break;
case SIAtomicScope::WORKGROUP:
- // In WGP mode the waves of a work-group can be executing on either CU of
- // the WGP. Therefore we need to ensure all operations have reached L1,
- // hence the SCOPE_SE WB.
- // For CU mode, we need operations to reach L0, so the wait is enough -
- // there are no ways for an operation to report completion without reaching
- // at least L0.
- if (ST.isCuModeEnabled())
- SkipWB = true;
- else
- ScopeImm = AMDGPU::CPol::SCOPE_SE;
+ // No WB necessary, but we still have to wait.
break;
case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:
- // No cache to invalidate.
+ // No WB or wait necessary here.
return false;
default:
llvm_unreachable("Unsupported synchronization scope");
}
- if (!SkipWB)
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)).addImm(ScopeImm);
-
if (Pos == Position::AFTER)
--MI;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
index df81b926bceb39..43266554c2d8a6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
@@ -18,7 +18,6 @@ define float @local_atomic_fmax_ret_f32(ptr addrspace(3) %ptr, float %val) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -91,7 +90,6 @@ define void @local_atomic_fmax_noret_f32(ptr addrspace(3) %ptr, float %val) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_max_num_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -164,7 +162,6 @@ define double @local_atomic_fmax_ret_f64(ptr addrspace(3) %ptr, double %val) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_max_num_rtn_f64 v[0:1], v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -241,7 +238,6 @@ define void @local_atomic_fmax_noret_f64(ptr addrspace(3) %ptr, double %val) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_max_num_f64 v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -318,7 +314,6 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -465,7 +460,6 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -617,7 +611,6 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -774,7 +767,6 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -915,7 +907,6 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1058,7 +1049,6 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -1209,7 +1199,6 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1364,7 +1353,6 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1507,7 +1495,6 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -1684,7 +1671,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -1865,11 +1851,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -2058,11 +2044,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
index 53d9bf0751a1d4..9be4fec5a3b95b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
@@ -18,7 +18,6 @@ define float @local_atomic_fmin_ret_f32(ptr addrspace(3) %ptr, float %val) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -91,7 +90,6 @@ define void @local_atomic_fmin_noret_f32(ptr addrspace(3) %ptr, float %val) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_min_num_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -164,7 +162,6 @@ define double @local_atomic_fmin_ret_f64(ptr addrspace(3) %ptr, double %val) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_min_num_rtn_f64 v[0:1], v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -241,7 +238,6 @@ define void @local_atomic_fmin_noret_f64(ptr addrspace(3) %ptr, double %val) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_min_num_f64 v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -318,7 +314,6 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -465,7 +460,6 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -617,7 +611,6 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -774,7 +767,6 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -915,7 +907,6 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1058,7 +1049,6 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -1209,7 +1199,6 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1364,7 +1353,6 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1507,7 +1495,6 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -1684,7 +1671,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -1865,11 +1851,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -2058,11 +2044,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
index 4c34209752c012..55ff6410c23508 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll
@@ -1257,7 +1257,6 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4095(ptr addrspace(1) inr
; GFX12-LABEL: mubuf_atomicrmw_sgpr_ptr_offset4095:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_dual_mov_b32 v0, 2 :: v_dual_mov_b32 v1, 0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_add_u32 v0, v1, v0, s[2:3] offset:16380 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1305,7 +1304,6 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4294967296(ptr addrspace(
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_mov_b32_e32 v2, 2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1347,7 +1345,6 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4095(ptr addrspace(1) %pt
; GFX12-LABEL: mubuf_atomicrmw_vgpr_ptr_offset4095:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v2, 2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off offset:16380 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1391,7 +1388,6 @@ define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4294967296(ptr addrspace(
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 0
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 4, v1, vcc_lo
; GFX12-NEXT: v_mov_b32_e32 v2, 2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1444,7 +1440,6 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_vgpr_offset(ptr addrspace(1) in
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_add_u32 v0, v[0:1], v4, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1489,7 +1484,6 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4095(ptr addrspace(1) inreg
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v2, v0
; GFX12-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v0, v[1:2], s[2:3] offset:16380 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1539,7 +1533,6 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_offset4294967296(ptr addrspace(1)
; GFX12-NEXT: s_add_co_ci_u32 s1, s3, 4
; GFX12-NEXT: v_mov_b32_e32 v2, v0
; GFX12-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[3:4], v[1:2], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1582,7 +1575,6 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4095(ptr addrspace(1) %ptr,
; GFX12-LABEL: mubuf_cmpxchg_vgpr_ptr_offset4095:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v4, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off offset:16380 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1627,7 +1619,6 @@ define amdgpu_ps float @mubuf_cmpxchg_vgpr_ptr_offset4294967296(ptr addrspace(1)
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 0
; GFX12-NEXT: v_mov_b32_e32 v4, v2
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 4, v1, vcc_lo
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1681,7 +1672,6 @@ define amdgpu_ps float @mubuf_cmpxchg_sgpr_ptr_vgpr_offset(ptr addrspace(1) inre
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v4, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index ce608df44dc434..d24eed841a9af1 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -282,7 +282,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264-NEXT: s_wait_kmcnt 0x0
; GFX1264-NEXT: s_mov_b32 s8, s2
; GFX1264-NEXT: s_mov_b32 s9, s3
-; GFX1264-NEXT: global_wb scope:SCOPE_DEV
; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
@@ -321,7 +320,6 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: s_mov_b32 s8, s2
; GFX1232-NEXT: s_mov_b32 s9, s3
-; GFX1232-NEXT: global_wb scope:SCOPE_DEV
; GFX1232-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
@@ -618,7 +616,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: v_mov_b32_e32 v1, s3
; GFX1264-NEXT: s_mov_b32 s8, s6
; GFX1264-NEXT: s_mov_b32 s9, s7
-; GFX1264-NEXT: global_wb scope:SCOPE_DEV
; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
@@ -657,7 +654,6 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: v_mov_b32_e32 v1, s2
; GFX1232-NEXT: s_mov_b32 s8, s6
; GFX1232-NEXT: s_mov_b32 s9, s7
-; GFX1232-NEXT: global_wb scope:SCOPE_DEV
; GFX1232-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
@@ -1049,7 +1045,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s8, s2
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s9, s3
-; GFX1264_ITERATIVE-NEXT: global_wb scope:SCOPE_DEV
; GFX1264_ITERATIVE-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0
; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV
@@ -1103,7 +1098,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1232_ITERATIVE-NEXT: s_mov_b32 s8, s2
; GFX1232_ITERATIVE-NEXT: s_mov_b32 s9, s3
-; GFX1232_ITERATIVE-NEXT: global_wb scope:SCOPE_DEV
; GFX1232_ITERATIVE-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0
; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV
@@ -1537,7 +1531,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0
; GFX1264_DPP-NEXT: s_mov_b32 s4, s2
; GFX1264_DPP-NEXT: s_mov_b32 s5, s3
-; GFX1264_DPP-NEXT: global_wb scope:SCOPE_DEV
; GFX1264_DPP-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0
; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV
@@ -1599,7 +1592,6 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0
; GFX1232_DPP-NEXT: s_mov_b32 s4, s2
; GFX1232_DPP-NEXT: s_mov_b32 s5, s3
-; GFX1232_DPP-NEXT: global_wb scope:SCOPE_DEV
; GFX1232_DPP-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0
; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV
@@ -1907,7 +1899,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264-NEXT: s_wait_kmcnt 0x0
; GFX1264-NEXT: s_mov_b32 s8, s2
; GFX1264-NEXT: s_mov_b32 s9, s3
-; GFX1264-NEXT: global_wb scope:SCOPE_DEV
; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
@@ -1948,7 +1939,6 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: s_mov_b32 s8, s2
; GFX1232-NEXT: s_mov_b32 s9, s3
-; GFX1232-NEXT: global_wb scope:SCOPE_DEV
; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
@@ -2300,7 +2290,6 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: s_mov_b32 s10, -1
; GFX1264-NEXT: s_mov_b32 s8, s6
; GFX1264-NEXT: s_mov_b32 s9, s7
-; GFX1264-NEXT: global_wb scope:SCOPE_DEV
; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
@@ -2342,7 +2331,6 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX1232-NEXT: s_mov_b32 s12, s6
; GFX1232-NEXT: s_mov_b32 s13, s7
-; GFX1232-NEXT: global_wb scope:SCOPE_DEV
; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
@@ -2792,7 +2780,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s8, s2
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s9, s3
-; GFX1264_ITERATIVE-NEXT: global_wb scope:SCOPE_DEV
; GFX1264_ITERATIVE-NEXT: buffer_atomic_add_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0
; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV
@@ -2848,7 +2835,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1232_ITERATIVE-NEXT: s_mov_b32 s8, s2
; GFX1232_ITERATIVE-NEXT: s_mov_b32 s9, s3
-; GFX1232_ITERATIVE-NEXT: global_wb scope:SCOPE_DEV
; GFX1232_ITERATIVE-NEXT: buffer_atomic_add_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0
; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV
@@ -3553,7 +3539,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0
; GFX1264_DPP-NEXT: s_mov_b32 s4, s2
; GFX1264_DPP-NEXT: s_mov_b32 s5, s3
-; GFX1264_DPP-NEXT: global_wb scope:SCOPE_DEV
; GFX1264_DPP-NEXT: buffer_atomic_add_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0
; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV
@@ -3645,7 +3630,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0
; GFX1232_DPP-NEXT: s_mov_b32 s4, s2
; GFX1232_DPP-NEXT: s_mov_b32 s5, s3
-; GFX1232_DPP-NEXT: global_wb scope:SCOPE_DEV
; GFX1232_DPP-NEXT: buffer_atomic_add_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0
; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV
@@ -3941,7 +3925,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264-NEXT: s_wait_kmcnt 0x0
; GFX1264-NEXT: s_mov_b32 s8, s2
; GFX1264-NEXT: s_mov_b32 s9, s3
-; GFX1264-NEXT: global_wb scope:SCOPE_DEV
; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
@@ -3981,7 +3964,6 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: s_mov_b32 s8, s2
; GFX1232-NEXT: s_mov_b32 s9, s3
-; GFX1232-NEXT: global_wb scope:SCOPE_DEV
; GFX1232-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
@@ -4283,7 +4265,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: v_mov_b32_e32 v1, s3
; GFX1264-NEXT: s_mov_b32 s8, s6
; GFX1264-NEXT: s_mov_b32 s9, s7
-; GFX1264-NEXT: global_wb scope:SCOPE_DEV
; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
@@ -4323,7 +4304,6 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: v_mov_b32_e32 v1, s2
; GFX1232-NEXT: s_mov_b32 s8, s6
; GFX1232-NEXT: s_mov_b32 s9, s7
-; GFX1232-NEXT: global_wb scope:SCOPE_DEV
; GFX1232-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
@@ -4716,7 +4696,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s8, s2
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s9, s3
-; GFX1264_ITERATIVE-NEXT: global_wb scope:SCOPE_DEV
; GFX1264_ITERATIVE-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0
; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV
@@ -4770,7 +4749,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1232_ITERATIVE-NEXT: s_mov_b32 s8, s2
; GFX1232_ITERATIVE-NEXT: s_mov_b32 s9, s3
-; GFX1232_ITERATIVE-NEXT: global_wb scope:SCOPE_DEV
; GFX1232_ITERATIVE-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0
; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV
@@ -5204,7 +5182,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0
; GFX1264_DPP-NEXT: s_mov_b32 s4, s2
; GFX1264_DPP-NEXT: s_mov_b32 s5, s3
-; GFX1264_DPP-NEXT: global_wb scope:SCOPE_DEV
; GFX1264_DPP-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0
; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV
@@ -5266,7 +5243,6 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0
; GFX1232_DPP-NEXT: s_mov_b32 s4, s2
; GFX1232_DPP-NEXT: s_mov_b32 s5, s3
-; GFX1232_DPP-NEXT: global_wb scope:SCOPE_DEV
; GFX1232_DPP-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0
; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV
@@ -5589,7 +5565,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264-NEXT: s_wait_kmcnt 0x0
; GFX1264-NEXT: s_mov_b32 s8, s2
; GFX1264-NEXT: s_mov_b32 s9, s3
-; GFX1264-NEXT: global_wb scope:SCOPE_DEV
; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
@@ -5633,7 +5608,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: s_mov_b32 s8, s2
; GFX1232-NEXT: s_mov_b32 s9, s3
-; GFX1232-NEXT: global_wb scope:SCOPE_DEV
; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
@@ -6001,7 +5975,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: s_mov_b32 s10, -1
; GFX1264-NEXT: s_mov_b32 s8, s6
; GFX1264-NEXT: s_mov_b32 s9, s7
-; GFX1264-NEXT: global_wb scope:SCOPE_DEV
; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
@@ -6047,7 +6020,6 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX1232-NEXT: s_mov_b32 s12, s6
; GFX1232-NEXT: s_mov_b32 s13, s7
-; GFX1232-NEXT: global_wb scope:SCOPE_DEV
; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
@@ -6501,7 +6473,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s8, s2
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s9, s3
-; GFX1264_ITERATIVE-NEXT: global_wb scope:SCOPE_DEV
; GFX1264_ITERATIVE-NEXT: buffer_atomic_sub_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1264_ITERATIVE-NEXT: s_wait_loadcnt 0x0
; GFX1264_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV
@@ -6557,7 +6528,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1232_ITERATIVE-NEXT: s_mov_b32 s8, s2
; GFX1232_ITERATIVE-NEXT: s_mov_b32 s9, s3
-; GFX1232_ITERATIVE-NEXT: global_wb scope:SCOPE_DEV
; GFX1232_ITERATIVE-NEXT: buffer_atomic_sub_u64 v[2:3], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1232_ITERATIVE-NEXT: s_wait_loadcnt 0x0
; GFX1232_ITERATIVE-NEXT: global_inv scope:SCOPE_DEV
@@ -7262,7 +7232,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: s_wait_kmcnt 0x0
; GFX1264_DPP-NEXT: s_mov_b32 s4, s2
; GFX1264_DPP-NEXT: s_mov_b32 s5, s3
-; GFX1264_DPP-NEXT: global_wb scope:SCOPE_DEV
; GFX1264_DPP-NEXT: buffer_atomic_sub_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1264_DPP-NEXT: s_wait_loadcnt 0x0
; GFX1264_DPP-NEXT: global_inv scope:SCOPE_DEV
@@ -7354,7 +7323,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: s_wait_kmcnt 0x0
; GFX1232_DPP-NEXT: s_mov_b32 s4, s2
; GFX1232_DPP-NEXT: s_mov_b32 s5, s3
-; GFX1232_DPP-NEXT: global_wb scope:SCOPE_DEV
; GFX1232_DPP-NEXT: buffer_atomic_sub_u64 v[8:9], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1232_DPP-NEXT: s_wait_loadcnt 0x0
; GFX1232_DPP-NEXT: global_inv scope:SCOPE_DEV
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
index 4f0bc512565d13..f5c9b1a79b4764 100644
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
@@ -210,7 +210,6 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 {
; GFX1200-NEXT: s_wait_samplecnt 0x0
; GFX1200-NEXT: s_wait_bvhcnt 0x0
; GFX1200-NEXT: s_wait_kmcnt 0x0
-; GFX1200-NEXT: global_wb scope:SCOPE_SE
; GFX1200-NEXT: s_wait_storecnt 0x0
; GFX1200-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -345,7 +344,6 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 {
; GFX1200-NEXT: s_wait_samplecnt 0x0
; GFX1200-NEXT: s_wait_bvhcnt 0x0
; GFX1200-NEXT: s_wait_kmcnt 0x0
-; GFX1200-NEXT: global_wb scope:SCOPE_SE
; GFX1200-NEXT: s_wait_storecnt 0x0
; GFX1200-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_SE
; GFX1200-NEXT: s_wait_storecnt_dscnt 0x0
@@ -437,7 +435,6 @@ define float @no_unsafe(ptr %addr, float %val) {
; GFX1200-NEXT: s_wait_samplecnt 0x0
; GFX1200-NEXT: s_wait_bvhcnt 0x0
; GFX1200-NEXT: s_wait_kmcnt 0x0
-; GFX1200-NEXT: global_wb scope:SCOPE_SE
; GFX1200-NEXT: s_wait_storecnt 0x0
; GFX1200-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
index e195026c13d27a..6486117e014d4e 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
@@ -22,7 +22,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -235,7 +234,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -426,7 +424,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
@@ -846,7 +843,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -1076,7 +1072,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -1300,7 +1295,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -1549,7 +1543,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -1798,7 +1791,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -2061,10 +2053,9 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -2338,8 +2329,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f64_e32 v[2:3], v[4:5], v[0:1]
; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -2624,8 +2615,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f64_e32 v[11:12], v[13:14], v[5:6]
; GFX12-NEXT: s_mov_b32 s2, exec_lo
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
; GFX12-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1
@@ -3137,10 +3128,9 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -3433,10 +3423,9 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -3721,15 +3710,15 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: v_add_f16_e32 v1, v1, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_f16_e32 v1, v1, v0
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -4096,15 +4085,15 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: v_add_f16_e32 v1, v1, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_f16_e32 v1, v1, v0
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -4478,14 +4467,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v6, v4, v7
; GFX12-NEXT: s_mov_b32 s2, exec_lo
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: v_add_f16_e32 v6, v6, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_f16_e32 v6, v6, v5
; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, v4, v6
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v6, v4, v6
; GFX12-NEXT: v_and_or_b32 v6, v7, v11, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
; GFX12-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
@@ -5109,23 +5098,23 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX12-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -5539,23 +5528,23 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX12-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -5976,22 +5965,22 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX12-NEXT: s_mov_b32 s2, exec_lo
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX12-NEXT: v_add_f32_e32 v4, v4, v10
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v4, v5
; GFX12-NEXT: v_mov_b32_e32 v5, v6
; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
@@ -6645,7 +6634,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -6917,7 +6905,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -7170,7 +7157,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
@@ -7671,7 +7657,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -7960,7 +7945,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -8246,7 +8230,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -8535,7 +8518,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -8825,7 +8807,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -9233,7 +9214,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -9632,7 +9612,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
@@ -10294,7 +10273,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -10702,7 +10680,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -11101,7 +11078,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -11509,7 +11485,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -11908,7 +11883,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: s_wait_storecnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
index c7569a6c155dbe..3253fb08836537 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
@@ -22,7 +22,6 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -222,7 +221,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -418,7 +416,6 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
@@ -813,7 +810,6 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -1095,7 +1091,6 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -1310,11 +1305,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -1528,11 +1523,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -1763,10 +1757,9 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14]
; GFX12-NEXT: s_mov_b32 s2, exec_lo
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[0:1], v[4:5]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1
@@ -2174,11 +2167,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -2487,11 +2480,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -2717,17 +2710,16 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -3108,17 +3100,16 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -3506,16 +3497,15 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX12-NEXT: s_mov_b32 s2, exec_lo
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4
; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v10
-; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v4, v5
; GFX12-NEXT: v_mov_b32_e32 v5, v6
; GFX12-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
@@ -4153,23 +4143,23 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -4585,23 +4575,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -5024,22 +5014,22 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX12-NEXT: s_mov_b32 s2, exec_lo
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX12-NEXT: v_max_num_f32_e32 v4, v4, v10
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v4, v5
; GFX12-NEXT: v_mov_b32_e32 v5, v6
; GFX12-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
@@ -5706,11 +5696,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v5, v0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v0, v5, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v0, v5, v5
; GFX12-NEXT: v_pk_max_num_f16 v4, v0, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -6051,10 +6041,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -6409,10 +6398,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6
; GFX12-NEXT: s_mov_b32 s2, exec_lo
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v8
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v4, v5
; GFX12-NEXT: v_mov_b32_e32 v5, v6
; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
@@ -7012,27 +7000,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v6, v0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_bfe_u32 v7, v1, 16, 1
; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
-; GFX12-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
-; GFX12-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
; GFX12-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -7459,11 +7447,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v3 :: v_dual_max_num_f32 v0, v0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_bfe_u32 v6, v0, 16, 1
; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
@@ -7916,11 +7904,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX12-NEXT: s_mov_b32 s2, exec_lo
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v9 :: v_dual_max_num_f32 v4, v4, v8
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_bfe_u32 v10, v4, 16, 1
; GFX12-NEXT: v_or_b32_e32 v12, 0x400000, v4
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
index 9f97d2033bbb54..6ce2f350257c8e 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
@@ -22,7 +22,6 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -222,7 +221,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -418,7 +416,6 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
@@ -813,7 +810,6 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -1095,7 +1091,6 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -1310,11 +1305,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -1528,11 +1523,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -1763,10 +1757,9 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14]
; GFX12-NEXT: s_mov_b32 s2, exec_lo
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[0:1], v[4:5]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1
@@ -2174,11 +2167,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -2487,11 +2480,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -2717,17 +2710,16 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -3108,17 +3100,16 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -3506,16 +3497,15 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX12-NEXT: s_mov_b32 s2, exec_lo
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4
; GFX12-NEXT: v_min_num_f16_e32 v4, v4, v10
-; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v4, v5
; GFX12-NEXT: v_mov_b32_e32 v5, v6
; GFX12-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
@@ -4153,23 +4143,23 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -4585,23 +4575,23 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -5024,22 +5014,22 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6
; GFX12-NEXT: s_mov_b32 s2, exec_lo
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX12-NEXT: v_min_num_f32_e32 v4, v4, v10
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4
; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v4, v5
; GFX12-NEXT: v_mov_b32_e32 v5, v6
; GFX12-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
@@ -5706,11 +5696,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v5, v0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v0, v5, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_num_f16 v0, v5, v5
; GFX12-NEXT: v_pk_min_num_f16 v4, v0, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -6051,10 +6041,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -6409,10 +6398,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6
; GFX12-NEXT: s_mov_b32 s2, exec_lo
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_min_num_f16 v5, v4, v8
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v4, v5
; GFX12-NEXT: v_mov_b32_e32 v5, v6
; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
@@ -7012,27 +7000,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v6, v0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
; GFX12-NEXT: v_min_num_f32_e32 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_bfe_u32 v7, v1, 16, 1
; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
-; GFX12-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
-; GFX12-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
; GFX12-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -7459,11 +7447,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v3 :: v_dual_min_num_f32 v0, v0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_bfe_u32 v6, v0, 16, 1
; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
@@ -7916,11 +7904,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX12-NEXT: s_mov_b32 s2, exec_lo
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v9 :: v_dual_min_num_f32 v4, v4, v8
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_bfe_u32 v10, v4, 16, 1
; GFX12-NEXT: v_or_b32_e32 v12, 0x400000, v4
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
index 1ae1204e3cde18..61cac642d19e8d 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
@@ -20,7 +20,6 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amd
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -197,7 +196,6 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grai
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -384,7 +382,6 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grai
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -579,7 +576,6 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__am
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -786,7 +782,6 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -1004,7 +999,6 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -1636,7 +1630,6 @@ define void @flat_agent_atomic_fadd_noret_f32_maybe_remote(ptr %ptr, float %val)
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -1801,7 +1794,6 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(pt
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -1953,7 +1945,6 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__a
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2171,7 +2162,6 @@ define void @flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr %p
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2340,7 +2330,6 @@ define float @flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -2517,7 +2506,6 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -2704,7 +2692,6 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -2899,7 +2886,6 @@ define void @flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memor
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -3106,7 +3092,6 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fin
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -3324,7 +3309,6 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fin
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -4365,7 +4349,6 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ig
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -4530,7 +4513,6 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_i
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -4688,7 +4670,6 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -4853,7 +4834,6 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr %ptr,
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -5011,7 +4991,6 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -5188,7 +5167,6 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -5395,7 +5373,6 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -5544,7 +5521,6 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -5701,7 +5677,6 @@ define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -5884,7 +5859,6 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -6068,7 +6042,6 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -6262,7 +6235,6 @@ define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -6433,7 +6405,6 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -6611,7 +6582,6 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -6822,7 +6792,6 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -7113,7 +7082,6 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -7413,7 +7381,6 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -7711,7 +7678,6 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -7991,7 +7957,6 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -8280,7 +8245,6 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -8558,7 +8522,6 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -8776,7 +8739,6 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -9616,7 +9578,6 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -9961,7 +9922,6 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -10316,7 +10276,6 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -10670,7 +10629,6 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -11014,7 +10972,6 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -11349,7 +11306,6 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no
; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -11633,7 +11589,6 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -11921,7 +11876,6 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -12932,7 +12886,6 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -13120,7 +13073,6 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fi
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -13311,7 +13263,6 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -13516,7 +13467,6 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -13696,7 +13646,6 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -13883,7 +13832,6 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -14467,7 +14415,6 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -14655,7 +14602,6 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr %pt
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -14835,7 +14781,6 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -15023,7 +14968,6 @@ define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -15207,7 +15151,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -15483,7 +15426,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -15762,7 +15704,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -16055,7 +15996,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -16323,7 +16263,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -16598,7 +16537,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -17446,7 +17384,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory(
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -17722,7 +17659,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %p
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -17990,7 +17926,6 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -18266,7 +18201,6 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
index ed78f4a071e3d0..ad5498723940d7 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
@@ -20,7 +20,6 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -163,7 +162,6 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -312,7 +310,6 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -476,7 +473,6 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -617,7 +613,6 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -765,7 +760,6 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -1230,7 +1224,6 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1423,7 +1416,6 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__amd
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1570,7 +1562,6 @@ define float @flat_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memory
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1713,7 +1704,6 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1862,7 +1852,6 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -2026,7 +2015,6 @@ define void @flat_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_memor
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2167,7 +2155,6 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2315,7 +2302,6 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2794,7 +2780,6 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -2950,7 +2935,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -3111,7 +3095,6 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -3285,7 +3268,6 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -3436,7 +3418,6 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -3594,7 +3575,6 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -3767,7 +3747,6 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -3979,7 +3958,6 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -4153,7 +4131,6 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -4458,7 +4435,6 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -4772,7 +4748,6 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -5085,7 +5060,6 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -5381,7 +5355,6 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -5686,7 +5659,6 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -5981,7 +5953,6 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -6218,7 +6189,6 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -7095,7 +7065,6 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -7441,7 +7410,6 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -7797,7 +7765,6 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -8150,7 +8117,6 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -8485,7 +8451,6 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -8830,7 +8795,6 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -9166,7 +9130,6 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no
; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -9451,7 +9414,6 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -10431,7 +10393,6 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4
; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -10668,7 +10629,6 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4
; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -10908,7 +10868,6 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4
; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -11165,7 +11124,6 @@ define void @flat_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -11393,7 +11351,6 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -11628,7 +11585,6 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -12383,7 +12339,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -12730,7 +12685,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -13080,7 +13034,6 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -13446,7 +13399,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -13782,7 +13734,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -14125,7 +14076,6 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
index bdb945a652eb21..dbf2626ec4d4f0 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
@@ -20,7 +20,6 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -163,7 +162,6 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -312,7 +310,6 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -476,7 +473,6 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -617,7 +613,6 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -765,7 +760,6 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -1230,7 +1224,6 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1423,7 +1416,6 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amd
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1570,7 +1562,6 @@ define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1713,7 +1704,6 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1862,7 +1852,6 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -2026,7 +2015,6 @@ define void @flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memor
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2167,7 +2155,6 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2315,7 +2302,6 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2794,7 +2780,6 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -2950,7 +2935,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -3111,7 +3095,6 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -3285,7 +3268,6 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -3436,7 +3418,6 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -3594,7 +3575,6 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -3767,7 +3747,6 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr,
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -3979,7 +3958,6 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -4153,7 +4131,6 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -4458,7 +4435,6 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -4772,7 +4748,6 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -5085,7 +5060,6 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -5381,7 +5355,6 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -5686,7 +5659,6 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -5981,7 +5953,6 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -6218,7 +6189,6 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -7095,7 +7065,6 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -7441,7 +7410,6 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -7797,7 +7765,6 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -8150,7 +8117,6 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -8485,7 +8451,6 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -8830,7 +8795,6 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -9166,7 +9130,6 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no
; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -9451,7 +9414,6 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -10431,7 +10393,6 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4
; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -10668,7 +10629,6 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4
; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -10908,7 +10868,6 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4
; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -11165,7 +11124,6 @@ define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -11393,7 +11351,6 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -11628,7 +11585,6 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -12383,7 +12339,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_m
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -12730,7 +12685,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -13080,7 +13034,6 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -13446,7 +13399,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory(
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -13782,7 +13734,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -14125,7 +14076,6 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
index c7f2bf6d1b317f..9cc4f3987b320e 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
@@ -28,7 +28,6 @@ define float @flat_agent_atomic_fsub_ret_f32(ptr %ptr, float %val) #0 {
; GFX12-NEXT: v_mov_b32_e32 v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -225,7 +224,6 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val
; GFX12-NEXT: v_mov_b32_e32 v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -426,7 +424,6 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX12-NEXT: v_mov_b32_e32 v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -643,7 +640,6 @@ define void @flat_agent_atomic_fsub_noret_f32(ptr %ptr, float %val) #0 {
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -830,7 +826,6 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %va
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1024,7 +1019,6 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1641,7 +1635,6 @@ define float @flat_agent_atomic_fsub_ret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX12-NEXT: v_mov_b32_e32 v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1838,7 +1831,6 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float
; GFX12-NEXT: v_mov_b32_e32 v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -2039,7 +2031,6 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float
; GFX12-NEXT: v_mov_b32_e32 v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -2256,7 +2247,6 @@ define void @flat_agent_atomic_fsub_noret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -2443,7 +2433,6 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, floa
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -2637,7 +2626,6 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -3254,7 +3242,6 @@ define double @flat_agent_atomic_fsub_ret_f64(ptr %ptr, double %val) #0 {
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -3467,7 +3454,6 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -3681,7 +3667,6 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -3908,7 +3893,6 @@ define void @flat_agent_atomic_fsub_noret_f64(ptr %ptr, double %val) #0 {
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -4105,7 +4089,6 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -4309,7 +4292,6 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -4551,7 +4533,6 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 {
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -4842,7 +4823,6 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -5142,7 +5122,6 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -5440,7 +5419,6 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -5720,7 +5698,6 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -6009,7 +5986,6 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -6289,7 +6265,6 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal
; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -6511,7 +6486,6 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -7345,7 +7319,6 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -7690,7 +7663,6 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat %
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -8045,7 +8017,6 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat %
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -8397,7 +8368,6 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -8731,7 +8701,6 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat %
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -9075,7 +9044,6 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat %
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -9410,7 +9378,6 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr,
; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -9694,7 +9661,6 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr,
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -10669,7 +10635,6 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16(ptr %ptr, <2 x half> %val) #
; GFX12-NEXT: v_mov_b32_e32 v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -10889,7 +10854,6 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2
; GFX12-NEXT: v_mov_b32_e32 v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -11112,7 +11076,6 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2
; GFX12-NEXT: v_mov_b32_e32 v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -11351,7 +11314,6 @@ define void @flat_agent_atomic_fsub_noret_v2f16(ptr %ptr, <2 x half> %val) #0 {
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -11560,7 +11522,6 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -11776,7 +11737,6 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -12479,7 +12439,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16(ptr %ptr, <2 x bfloat> %v
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -12826,7 +12785,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -13176,7 +13134,6 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -13542,7 +13499,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -13878,7 +13834,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -14221,7 +14176,6 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
index 86e6224d2f8d56..eded1ee04625b4 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -40,7 +40,6 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -98,7 +97,6 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -162,7 +160,6 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -226,7 +223,6 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -274,7 +270,6 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -327,7 +322,6 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -386,7 +380,6 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -445,7 +438,6 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -496,7 +488,6 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -554,7 +545,6 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -618,7 +608,6 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -682,7 +671,6 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -730,7 +718,6 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -783,7 +770,6 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -842,7 +828,6 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -901,7 +886,6 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -952,7 +936,6 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1010,7 +993,6 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1074,7 +1056,6 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1138,7 +1119,6 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1186,7 +1166,6 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1239,7 +1218,6 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1298,7 +1276,6 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1357,7 +1334,6 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1406,7 +1382,6 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -1464,7 +1439,6 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -1526,7 +1500,6 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -1590,7 +1563,6 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -1636,7 +1608,6 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -1689,7 +1660,6 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -1746,7 +1716,6 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -1805,7 +1774,6 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -1854,7 +1822,6 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -1912,7 +1879,6 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -1974,7 +1940,6 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2038,7 +2003,6 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2084,7 +2048,6 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2137,7 +2100,6 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2194,7 +2156,6 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2253,7 +2214,6 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2302,7 +2262,6 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2360,7 +2319,6 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2422,7 +2380,6 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2486,7 +2443,6 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2532,7 +2488,6 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2585,7 +2540,6 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2642,7 +2596,6 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2701,7 +2654,6 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2750,7 +2702,6 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2808,7 +2759,6 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2870,7 +2820,6 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2934,7 +2883,6 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2980,7 +2928,6 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -3033,7 +2980,6 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -3090,7 +3036,6 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -3149,7 +3094,6 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -3200,7 +3144,6 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3258,7 +3201,6 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3322,7 +3264,6 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3386,7 +3327,6 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3434,7 +3374,6 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3487,7 +3426,6 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3546,7 +3484,6 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3605,7 +3542,6 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3656,7 +3592,6 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3704,7 +3639,6 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3752,7 +3686,6 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3810,7 +3743,6 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3874,7 +3806,6 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3938,7 +3869,6 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3986,7 +3916,6 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4039,7 +3968,6 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4098,7 +4026,6 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4157,7 +4084,6 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4208,7 +4134,6 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4266,7 +4191,6 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4330,7 +4254,6 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4394,7 +4317,6 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4442,7 +4364,6 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4495,7 +4416,6 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4554,7 +4474,6 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4613,7 +4532,6 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5081,7 +4999,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old
; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5138,7 +5055,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol
; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:72000 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5197,7 +5113,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6
; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5262,7 +5177,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5334,7 +5248,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[2:3]
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5392,7 +5305,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) {
; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5446,7 +5358,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in,
; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5506,7 +5417,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5573,7 +5483,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[2:3]
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6033,7 +5942,6 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6091,7 +5999,6 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6155,7 +6062,6 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6219,7 +6125,6 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2,
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6267,7 +6172,6 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6320,7 +6224,6 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6379,7 +6282,6 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6438,7 +6340,6 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6489,7 +6390,6 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6547,7 +6447,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6611,7 +6510,6 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6675,7 +6573,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2,
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6723,7 +6620,6 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6776,7 +6672,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6835,7 +6730,6 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6894,7 +6788,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
index 88a95937b9c906..58a6c2ab4bf030 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
@@ -105,7 +105,6 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr,
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: ds_pk_add_f16 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -129,7 +128,6 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half>
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -155,7 +153,6 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr,
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: ds_pk_add_bf16 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -179,7 +176,6 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16>
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
index 361cc1e9e6c1db..059f925ee99a42 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
@@ -21,7 +21,6 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -206,7 +205,6 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -393,7 +391,6 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -590,7 +587,6 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -769,7 +765,6 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -951,7 +946,6 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -1514,7 +1508,6 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -1717,7 +1710,6 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -1920,7 +1912,6 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_deno
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -2116,7 +2107,6 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -2303,7 +2293,6 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -2476,7 +2465,6 @@ define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr a
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -2679,7 +2667,6 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %p
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -2875,7 +2862,6 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -3057,7 +3043,6 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -3213,7 +3198,6 @@ define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -3409,7 +3393,6 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -3610,7 +3593,6 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr add
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -3803,7 +3785,6 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -4004,7 +3985,6 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -4197,7 +4177,6 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -4382,7 +4361,6 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -4565,7 +4543,6 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -4736,7 +4713,6 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fi
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -4909,7 +4885,6 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fi
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -5092,7 +5067,6 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -5245,7 +5219,6 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_f
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -5401,7 +5374,6 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_f
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -5898,7 +5870,6 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_f
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -6071,7 +6042,6 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -6227,7 +6197,6 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -6428,7 +6397,6 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(pt
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -6621,7 +6589,6 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -6792,7 +6759,6 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -6957,7 +6923,6 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -7180,7 +7145,6 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -7404,7 +7368,6 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -7634,7 +7597,6 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -7839,7 +7801,6 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -8047,7 +8008,6 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -8283,7 +8243,6 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -8624,7 +8583,6 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -8976,7 +8934,6 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -9326,7 +9283,6 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -9655,7 +9611,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -9994,7 +9949,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -10324,7 +10278,6 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -10585,7 +10538,6 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -11558,7 +11510,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -11953,7 +11904,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -12360,7 +12310,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -12764,7 +12713,6 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -13147,7 +13095,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -13541,7 +13488,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -13926,7 +13872,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -14249,7 +14194,6 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -15355,7 +15299,6 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -15586,7 +15529,6 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -15819,7 +15761,6 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -16056,7 +15997,6 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -16265,7 +16205,6 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -16477,7 +16416,6 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -17146,7 +17084,6 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -17391,7 +17328,6 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -17626,7 +17562,6 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -17857,7 +17792,6 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -18066,7 +18000,6 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -18311,7 +18244,6 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1)
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -18550,7 +18482,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -18879,7 +18810,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -19210,7 +19140,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -19545,7 +19474,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -19864,7 +19792,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -20186,7 +20113,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -21173,7 +21099,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -21502,7 +21427,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -21821,7 +21745,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -22150,7 +22073,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -22469,7 +22391,6 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -22798,7 +22719,6 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
index 84003a0432f7ef..e2fde562d36b11 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
@@ -21,7 +21,6 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -183,7 +182,6 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -347,7 +345,6 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -511,7 +508,6 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -667,7 +663,6 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -826,7 +821,6 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -1312,7 +1306,6 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_remote_memory(ptr addr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -1543,7 +1536,6 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory__a
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -1709,7 +1701,6 @@ define float @global_agent_atomic_fmax_ret_f32__ftz__amdgpu_no_fine_grained_memo
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -1871,7 +1862,6 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fi
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -2035,7 +2025,6 @@ define float @global_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fi
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -2199,7 +2188,6 @@ define void @global_agent_atomic_fmax_noret_f32__ftz__amdgpu_no_fine_grained_mem
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -2355,7 +2343,6 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_f
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -2514,7 +2501,6 @@ define void @global_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_f
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -3014,7 +3000,6 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -3188,7 +3173,6 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -3363,7 +3347,6 @@ define double @global_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -3537,7 +3520,6 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -3701,7 +3683,6 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -3868,7 +3849,6 @@ define void @global_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -4036,7 +4016,6 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -4286,7 +4265,6 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -4478,7 +4456,6 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -4833,7 +4810,6 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -5199,7 +5175,6 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -5564,7 +5539,6 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -5909,7 +5883,6 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -6264,7 +6237,6 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -6609,7 +6581,6 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -6885,7 +6856,6 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -7901,7 +7871,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -8298,7 +8267,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -8707,7 +8675,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -9113,7 +9080,6 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -9498,7 +9464,6 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -9894,7 +9859,6 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -10281,7 +10245,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -10606,7 +10569,6 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -11728,7 +11690,6 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__amdgpu_no_fine_grained_me
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4
; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -12022,7 +11983,6 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4
; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -12318,7 +12278,6 @@ define <2 x half> @global_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4
; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -12617,7 +12576,6 @@ define void @global_agent_atomic_fmax_noret_v2f16__amdgpu_no_fine_grained_memory
; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -12900,7 +12858,6 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -13186,7 +13143,6 @@ define void @global_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine
; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v4
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -14089,7 +14045,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__amdgpu_no_fine_grained
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -14489,7 +14444,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -14891,7 +14845,6 @@ define <2 x bfloat> @global_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -15295,7 +15248,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -15682,7 +15634,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -16072,7 +16023,6 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
index 2aad91cd1071fc..903e80b15814fd 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
@@ -21,7 +21,6 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -183,7 +182,6 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -347,7 +345,6 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_gr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -511,7 +508,6 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -667,7 +663,6 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -826,7 +821,6 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -1312,7 +1306,6 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr addr
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -1543,7 +1536,6 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__a
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -1709,7 +1701,6 @@ define float @global_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memo
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -1871,7 +1862,6 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fi
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -2035,7 +2025,6 @@ define float @global_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fi
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -2199,7 +2188,6 @@ define void @global_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_mem
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -2355,7 +2343,6 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_f
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -2514,7 +2501,6 @@ define void @global_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_f
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -3014,7 +3000,6 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -3188,7 +3173,6 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -3363,7 +3347,6 @@ define double @global_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -3537,7 +3520,6 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -3701,7 +3683,6 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -3868,7 +3849,6 @@ define void @global_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -4036,7 +4016,6 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -4286,7 +4265,6 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
; GFX12-NEXT: v_min_num_f64_e32 v[4:5], v[4:5], v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -4478,7 +4456,6 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -4833,7 +4810,6 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -5199,7 +5175,6 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -5564,7 +5539,6 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -5909,7 +5883,6 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -6264,7 +6237,6 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -6609,7 +6581,6 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -6885,7 +6856,6 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -7901,7 +7871,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -8298,7 +8267,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -8707,7 +8675,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -9113,7 +9080,6 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -9498,7 +9464,6 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -9894,7 +9859,6 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -10281,7 +10245,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -10606,7 +10569,6 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -11728,7 +11690,6 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_me
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4
; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -12022,7 +11983,6 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4
; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -12318,7 +12278,6 @@ define <2 x half> @global_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4
; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -12617,7 +12576,6 @@ define void @global_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory
; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -12900,7 +12858,6 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine
; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -13186,7 +13143,6 @@ define void @global_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine
; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -14089,7 +14045,6 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -14489,7 +14444,6 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -14891,7 +14845,6 @@ define <2 x bfloat> @global_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -15295,7 +15248,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -15682,7 +15634,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -16072,7 +16023,6 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
index 2e3799e1714afe..3dbf6477a7cb89 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
@@ -29,7 +29,6 @@ define float @global_agent_atomic_fsub_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX12-NEXT: v_mov_b32_e32 v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -262,7 +261,6 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX12-NEXT: v_mov_b32_e32 v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -497,7 +495,6 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg(ptr addrspace(1) %
; GFX12-NEXT: v_mov_b32_e32 v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -740,7 +737,6 @@ define void @global_agent_atomic_fsub_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -962,7 +958,6 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -1187,7 +1182,6 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -1890,7 +1884,6 @@ define float @global_agent_atomic_fsub_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX12-NEXT: v_mov_b32_e32 v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -2123,7 +2116,6 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX12-NEXT: v_mov_b32_e32 v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -2358,7 +2350,6 @@ define float @global_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr addrspace
; GFX12-NEXT: v_mov_b32_e32 v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -2601,7 +2592,6 @@ define void @global_agent_atomic_fsub_noret_f32__ftz(ptr addrspace(1) %ptr, floa
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -2823,7 +2813,6 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr addrspac
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -3048,7 +3037,6 @@ define void @global_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr addrspac
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_f32_e32 v3, v4, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -3751,7 +3739,6 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -4004,7 +3991,6 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -4258,7 +4244,6 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -4518,7 +4503,6 @@ define void @global_agent_atomic_fsub_noret_f64(ptr addrspace(1) %ptr, double %v
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -4749,7 +4733,6 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -4983,7 +4966,6 @@ define void @global_agent_atomic_fsub_noret_f64__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -5245,7 +5227,6 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -5586,7 +5567,6 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -5938,7 +5918,6 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -6288,7 +6267,6 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -6617,7 +6595,6 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -6956,7 +6933,6 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -7286,7 +7262,6 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa
; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -7547,7 +7522,6 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -8520,7 +8494,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -8915,7 +8888,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -9322,7 +9294,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -9726,7 +9697,6 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -10109,7 +10079,6 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -10503,7 +10472,6 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -10888,7 +10856,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr
; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -11211,7 +11178,6 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -12325,7 +12291,6 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16(ptr addrspace(1) %ptr, <2
; GFX12-NEXT: v_mov_b32_e32 v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -12602,7 +12567,6 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr addrspa
; GFX12-NEXT: v_mov_b32_e32 v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -12881,7 +12845,6 @@ define <2 x half> @global_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr addrspa
; GFX12-NEXT: v_mov_b32_e32 v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -13162,7 +13125,6 @@ define void @global_agent_atomic_fsub_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -13426,7 +13388,6 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr addrspace(1
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -13693,7 +13654,6 @@ define void @global_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr addrspace(1
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 neg_lo:[0,1] neg_hi:[0,1]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -14544,7 +14504,6 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -14944,7 +14903,6 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr addr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -15346,7 +15304,6 @@ define <2 x bfloat> @global_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr addr
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -15750,7 +15707,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -16137,7 +16093,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace(
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -16527,7 +16482,6 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
index 24fd709514b476..ba2d48166b2e45 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -49,7 +49,6 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -121,7 +120,6 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -200,7 +198,6 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i
; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -282,7 +279,6 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -347,7 +343,6 @@ define amdgpu_kernel void @atomic_add_i64(ptr addrspace(1) %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -418,7 +413,6 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr addrspace(1) %out, ptr addrspa
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -494,7 +488,6 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -573,7 +566,6 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -633,7 +625,6 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -705,7 +696,6 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -784,7 +774,6 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i
; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -866,7 +855,6 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -931,7 +919,6 @@ define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1002,7 +989,6 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspa
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1078,7 +1064,6 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1157,7 +1142,6 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1217,7 +1201,6 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1289,7 +1272,6 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1368,7 +1350,6 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i
; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1450,7 +1431,6 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1515,7 +1495,6 @@ define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1586,7 +1565,6 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspa
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1662,7 +1640,6 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1741,7 +1718,6 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1795,7 +1771,6 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -1864,7 +1839,6 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -1937,7 +1911,6 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i
; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2016,7 +1989,6 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2075,7 +2047,6 @@ define amdgpu_kernel void @atomic_max_i64(ptr addrspace(1) %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2143,7 +2114,6 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspa
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2213,7 +2183,6 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2289,7 +2258,6 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2343,7 +2311,6 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2412,7 +2379,6 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2485,7 +2451,6 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out,
; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2564,7 +2529,6 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2623,7 +2587,6 @@ define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2691,7 +2654,6 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrsp
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2761,7 +2723,6 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in
; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2837,7 +2798,6 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2891,7 +2851,6 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2960,7 +2919,6 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -3033,7 +2991,6 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i
; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -3112,7 +3069,6 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -3171,7 +3127,6 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -3239,7 +3194,6 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspa
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -3309,7 +3263,6 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -3385,7 +3338,6 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -3439,7 +3391,6 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -3508,7 +3459,6 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -3581,7 +3531,6 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr addrspace(1) %out,
; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -3660,7 +3609,6 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -3719,7 +3667,6 @@ define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -3787,7 +3734,6 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrsp
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -3857,7 +3803,6 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in
; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -3933,7 +3878,6 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -3993,7 +3937,6 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4065,7 +4008,6 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr a
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4144,7 +4086,6 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i6
; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4226,7 +4167,6 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4291,7 +4231,6 @@ define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4362,7 +4301,6 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspac
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4438,7 +4376,6 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4517,7 +4454,6 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4577,7 +4513,6 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4633,7 +4568,6 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4689,7 +4623,6 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4761,7 +4694,6 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4840,7 +4772,6 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out,
; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4922,7 +4853,6 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4987,7 +4917,6 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5058,7 +4987,6 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrsp
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5134,7 +5062,6 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in
; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5213,7 +5140,6 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5273,7 +5199,6 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5345,7 +5270,6 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5424,7 +5348,6 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i
; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5506,7 +5429,6 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5571,7 +5493,6 @@ define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5642,7 +5563,6 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspa
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5718,7 +5638,6 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5797,7 +5716,6 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5873,7 +5791,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr addrspace(1) %out, i64
; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5947,7 +5864,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr addrspace(1) %out, i64
; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] offset:72000 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6021,7 +5937,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out,
; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: v_mov_b32_e32 v2, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6104,7 +6019,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou
; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6198,7 +6112,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1)
; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6276,7 +6189,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr addrspace(1) %out, i64 %in, i6
; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6349,7 +6261,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr add
; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: v_mov_b32_e32 v2, s6
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6429,7 +6340,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64
; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[0:1] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6520,7 +6430,6 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out,
; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -7336,7 +7245,6 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -7408,7 +7316,6 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -7487,7 +7394,6 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i
; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -7544,7 +7450,6 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in)
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -7616,7 +7521,6 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_dec_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -7695,7 +7599,6 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i
; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
-; GFX12-NEXT: global_wb scope:SCOPE_DEV
; GFX12-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
index 0045082eedb0a3..b21b2adbcba951 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
@@ -123,7 +123,6 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -809,7 +808,6 @@ define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) {
; GFX12-NEXT: s_mul_i32 s0, s0, 5
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v0, s1
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: ds_add_u32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -1058,7 +1056,6 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: s_mul_i32 s1, s1, 5
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s4
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: ds_add_rtn_u32 v1, v1, v2
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
index 295ae94902da73..0b3ef62856f540 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
@@ -22,7 +22,6 @@ define float @local_atomic_fadd_ret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 4.0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_add_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -138,7 +137,6 @@ define float @local_atomic_fadd_ret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 4.0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_add_rtn_f32 v0, v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -255,7 +253,6 @@ define void @local_atomic_fadd_noret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 4.0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_add_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -369,7 +366,6 @@ define void @local_atomic_fadd_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 4.0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_add_f32 v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -497,7 +493,6 @@ define double @local_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add_f64_e32 v[0:1], 4.0, v[3:4]
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[3:4]
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -690,7 +685,6 @@ define double @local_atomic_fadd_ret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add_f64_e32 v[0:1], 4.0, v[3:4]
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[3:4] offset:65528
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -881,7 +875,6 @@ define void @local_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: v_add_f64_e32 v[3:4], 4.0, v[1:2]
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -1065,7 +1058,6 @@ define void @local_atomic_fadd_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: v_add_f64_e32 v[3:4], 4.0, v[1:2]
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -1269,7 +1261,6 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: v_lshlrev_b32_e32 v2, v0, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v2, v4, v3, v2
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -1577,7 +1568,6 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -1892,7 +1882,6 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -2188,7 +2177,6 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4
; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -2484,7 +2472,6 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -2721,7 +2708,6 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -2972,7 +2958,6 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -3328,7 +3313,6 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -3691,7 +3675,6 @@ define void @local_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -4035,7 +4018,6 @@ define void @local_atomic_fadd_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4
; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -4379,7 +4361,6 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -4671,7 +4652,6 @@ define void @local_atomic_fadd_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -4941,7 +4921,6 @@ define <2 x half> @local_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -5165,7 +5144,6 @@ define <2 x half> @local_atomic_fadd_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -5389,7 +5367,6 @@ define void @local_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_pk_add_f16 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -5604,7 +5581,6 @@ define void @local_atomic_fadd_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_pk_add_f16 v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -5825,7 +5801,6 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -6133,7 +6108,6 @@ define <2 x bfloat> @local_atomic_fadd_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -6442,7 +6416,6 @@ define void @local_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_pk_add_bf16 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -6740,7 +6713,6 @@ define void @local_atomic_fadd_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_pk_add_bf16 v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -7057,7 +7029,6 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX12-NEXT: s_lshl_b32 s5, s1, 3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: ds_add_rtn_f32 v1, v2, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -7078,7 +7049,6 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX12-NEXT: s_lshl_b32 s0, s1, 4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mul_f32 v1, 0x42280000, v1
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: ds_add_f32 v2, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -7120,7 +7090,6 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX12-NEXT: s_cbranch_execz .LBB28_8
; GFX12-NEXT: ; %bb.7:
; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: ds_add_rtn_f32 v1, v1, v2
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -8769,7 +8738,6 @@ define float @local_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 4.0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_add_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -8885,7 +8853,6 @@ define void @local_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 4.0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_add_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
index cc79db1b20af46..d419b0cdfdd1ab 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
@@ -22,7 +22,6 @@ define float @local_atomic_fmax_ret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 4.0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -112,7 +111,6 @@ define float @local_atomic_fmax_ret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 4.0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -204,7 +202,6 @@ define void @local_atomic_fmax_noret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 4.0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_max_num_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -294,7 +291,6 @@ define void @local_atomic_fmax_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 4.0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_max_num_f32 v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -391,7 +387,6 @@ define double @local_atomic_fmax_ret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_max_num_rtn_f64 v[0:1], v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -489,7 +484,6 @@ define double @local_atomic_fmax_ret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_max_num_rtn_f64 v[0:1], v0, v[1:2] offset:65528
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -589,7 +583,6 @@ define void @local_atomic_fmax_noret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_max_num_f64 v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -687,7 +680,6 @@ define void @local_atomic_fmax_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_max_num_f64 v0, v[1:2] offset:65528
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -810,7 +802,6 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3
; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -1125,7 +1116,6 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3
; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -1447,7 +1437,6 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4
; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -1751,7 +1740,6 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -2055,7 +2043,6 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -2300,7 +2287,6 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -2558,7 +2544,6 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -2916,7 +2901,6 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -3281,7 +3265,6 @@ define void @local_atomic_fmax_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -3627,7 +3610,6 @@ define void @local_atomic_fmax_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4
; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -3973,7 +3955,6 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -4267,7 +4248,6 @@ define void @local_atomic_fmax_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -4549,7 +4529,6 @@ define <2 x half> @local_atomic_fmax_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v1
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -4822,7 +4801,6 @@ define <2 x half> @local_atomic_fmax_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v1
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -5094,7 +5072,6 @@ define void @local_atomic_fmax_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v1
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -5357,7 +5334,6 @@ define void @local_atomic_fmax_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v3, v3, v1
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -5644,7 +5620,6 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -6022,7 +5997,6 @@ define <2 x bfloat> @local_atomic_fmax_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -6399,7 +6373,6 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -6764,7 +6737,6 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -7111,7 +7083,6 @@ define float @local_atomic_fmax_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 4.0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -7201,7 +7172,6 @@ define void @local_atomic_fmax_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 4.0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_max_num_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
index 1ffd93e35d8cd9..282947afa409a8 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
@@ -22,7 +22,6 @@ define float @local_atomic_fmin_ret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 4.0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -112,7 +111,6 @@ define float @local_atomic_fmin_ret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 4.0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -204,7 +202,6 @@ define void @local_atomic_fmin_noret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 4.0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_min_num_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -294,7 +291,6 @@ define void @local_atomic_fmin_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 4.0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_min_num_f32 v0, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -391,7 +387,6 @@ define double @local_atomic_fmin_ret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_min_num_rtn_f64 v[0:1], v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -489,7 +484,6 @@ define double @local_atomic_fmin_ret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_min_num_rtn_f64 v[0:1], v0, v[1:2] offset:65528
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -589,7 +583,6 @@ define void @local_atomic_fmin_noret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_min_num_f64 v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -687,7 +680,6 @@ define void @local_atomic_fmin_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: v_mov_b32_e32 v2, 0x40100000
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_min_num_f64 v0, v[1:2] offset:65528
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -810,7 +802,6 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3
; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -1125,7 +1116,6 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3
; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -1447,7 +1437,6 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4
; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -1751,7 +1740,6 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -2055,7 +2043,6 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -2300,7 +2287,6 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -2558,7 +2544,6 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -2916,7 +2901,6 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -3281,7 +3265,6 @@ define void @local_atomic_fmin_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -3627,7 +3610,6 @@ define void @local_atomic_fmin_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4
; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -3973,7 +3955,6 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -4267,7 +4248,6 @@ define void @local_atomic_fmin_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -4549,7 +4529,6 @@ define <2 x half> @local_atomic_fmin_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v1
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -4822,7 +4801,6 @@ define <2 x half> @local_atomic_fmin_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3
; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v1
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -5094,7 +5072,6 @@ define void @local_atomic_fmin_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v1
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -5357,7 +5334,6 @@ define void @local_atomic_fmin_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX12-NEXT: v_pk_max_num_f16 v3, v2, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v1
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -5644,7 +5620,6 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -6022,7 +5997,6 @@ define <2 x bfloat> @local_atomic_fmin_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -6399,7 +6373,6 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -6764,7 +6737,6 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -7111,7 +7083,6 @@ define float @local_atomic_fmin_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 4.0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -7201,7 +7172,6 @@ define void @local_atomic_fmin_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, 4.0
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_min_num_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
index 9bc8bafc34a68f..1b08b64b046b48 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
@@ -29,7 +29,6 @@ define float @local_atomic_fsub_ret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: v_mov_b32_e32 v2, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add_f32_e32 v1, -4.0, v2
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -242,7 +241,6 @@ define float @local_atomic_fsub_ret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: v_mov_b32_e32 v2, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add_f32_e32 v1, -4.0, v2
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -454,7 +452,6 @@ define void @local_atomic_fsub_noret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: v_add_f32_e32 v2, -4.0, v1
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -656,7 +653,6 @@ define void @local_atomic_fsub_noret_f32__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: v_add_f32_e32 v2, -4.0, v1
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -867,7 +863,6 @@ define double @local_atomic_fsub_ret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add_f64_e32 v[0:1], -4.0, v[3:4]
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[3:4]
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -1085,7 +1080,6 @@ define double @local_atomic_fsub_ret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add_f64_e32 v[0:1], -4.0, v[3:4]
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[3:4] offset:65528
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -1301,7 +1295,6 @@ define void @local_atomic_fsub_noret_f64(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: v_add_f64_e32 v[3:4], -4.0, v[1:2]
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -1508,7 +1501,6 @@ define void @local_atomic_fsub_noret_f64__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: v_add_f64_e32 v[3:4], -4.0, v[1:2]
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2] offset:65528
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -1735,7 +1727,6 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: v_lshlrev_b32_e32 v2, v0, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v2, v4, v3, v2
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v1, v2, v4
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -2043,7 +2034,6 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -2358,7 +2348,6 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -2654,7 +2643,6 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4
; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -2950,7 +2938,6 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -3187,7 +3174,6 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -3438,7 +3424,6 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v0, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -3794,7 +3779,6 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v1, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v3, v4, v2, v3
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -4157,7 +4141,6 @@ define void @local_atomic_fsub_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: v_lshlrev_b32_e32 v4, v0, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v4, v2, v3, v4
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -4501,7 +4484,6 @@ define void @local_atomic_fsub_noret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v4, v1, v4
; GFX12-NEXT: v_and_or_b32 v4, v3, v2, v4
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -4845,7 +4827,6 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -5137,7 +5118,6 @@ define void @local_atomic_fsub_noret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -5415,7 +5395,6 @@ define <2 x half> @local_atomic_fsub_ret_v2f16(ptr addrspace(3) %ptr, <2 x half>
; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -5671,7 +5650,6 @@ define <2 x half> @local_atomic_fsub_ret_v2f16__offset(ptr addrspace(3) %ptr, <2
; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_add_f16 v2, v3, v1 neg_lo:[0,1] neg_hi:[0,1]
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v3 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -5925,7 +5903,6 @@ define void @local_atomic_fsub_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %va
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -6169,7 +6146,6 @@ define void @local_atomic_fsub_noret_v2f16__offset(ptr addrspace(3) %ptr, <2 x h
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: v_pk_add_f16 v3, v2, v1 neg_lo:[0,1] neg_hi:[0,1]
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v2 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -6440,7 +6416,6 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bf
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -6818,7 +6793,6 @@ define <2 x bfloat> @local_atomic_fsub_ret_v2bf16__offset(ptr addrspace(3) %ptr,
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0
; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v4 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -7195,7 +7169,6 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat>
; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -7560,7 +7533,6 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b
; GFX12-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_perm_b32 v4, v5, v4, 0x7060302
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 offset:65532
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -7914,7 +7886,6 @@ define float @local_atomic_fsub_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspa
; GFX12-NEXT: v_mov_b32_e32 v2, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_add_f32_e32 v1, -4.0, v2
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -8125,7 +8096,6 @@ define void @local_atomic_fsub_noret_f32__amdgpu_ignore_denormal_mode(ptr addrsp
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: v_add_f32_e32 v2, -4.0, v1
-; GFX12-NEXT: global_wb scope:SCOPE_SE
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1
; GFX12-NEXT: s_wait_dscnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
index b8fa35092baf8a..e1589ccd7350f1 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
@@ -136,7 +136,6 @@ define amdgpu_kernel void @workgroup_release_fence() {
;
; GFX12-WGP-LABEL: workgroup_release_fence:
; GFX12-WGP: ; %bb.0: ; %entry
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -208,7 +207,6 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
;
; GFX12-WGP-LABEL: workgroup_acq_rel_fence:
; GFX12-WGP: ; %bb.0: ; %entry
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -281,7 +279,6 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
;
; GFX12-WGP-LABEL: workgroup_seq_cst_fence:
; GFX12-WGP: ; %bb.0: ; %entry
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -420,7 +417,6 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
;
; GFX12-WGP-LABEL: workgroup_one_as_release_fence:
; GFX12-WGP: ; %bb.0: ; %entry
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -492,7 +488,6 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
;
; GFX12-WGP-LABEL: workgroup_one_as_acq_rel_fence:
; GFX12-WGP: ; %bb.0: ; %entry
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -565,7 +560,6 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
;
; GFX12-WGP-LABEL: workgroup_one_as_seq_cst_fence:
; GFX12-WGP: ; %bb.0: ; %entry
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -737,7 +731,6 @@ define amdgpu_kernel void @agent_release_fence() {
;
; GFX12-WGP-LABEL: agent_release_fence:
; GFX12-WGP: ; %bb.0: ; %entry
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -746,7 +739,6 @@ define amdgpu_kernel void @agent_release_fence() {
;
; GFX12-CU-LABEL: agent_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -835,7 +827,6 @@ define amdgpu_kernel void @agent_acq_rel_fence() {
;
; GFX12-WGP-LABEL: agent_acq_rel_fence:
; GFX12-WGP: ; %bb.0: ; %entry
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -845,7 +836,6 @@ define amdgpu_kernel void @agent_acq_rel_fence() {
;
; GFX12-CU-LABEL: agent_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -935,7 +925,6 @@ define amdgpu_kernel void @agent_seq_cst_fence() {
;
; GFX12-WGP-LABEL: agent_seq_cst_fence:
; GFX12-WGP: ; %bb.0: ; %entry
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -945,7 +934,6 @@ define amdgpu_kernel void @agent_seq_cst_fence() {
;
; GFX12-CU-LABEL: agent_seq_cst_fence:
; GFX12-CU: ; %bb.0: ; %entry
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -1113,7 +1101,6 @@ define amdgpu_kernel void @agent_one_as_release_fence() {
;
; GFX12-WGP-LABEL: agent_one_as_release_fence:
; GFX12-WGP: ; %bb.0: ; %entry
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -1122,7 +1109,6 @@ define amdgpu_kernel void @agent_one_as_release_fence() {
;
; GFX12-CU-LABEL: agent_one_as_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -1211,7 +1197,6 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
;
; GFX12-WGP-LABEL: agent_one_as_acq_rel_fence:
; GFX12-WGP: ; %bb.0: ; %entry
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -1221,7 +1206,6 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
;
; GFX12-CU-LABEL: agent_one_as_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -1311,7 +1295,6 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
;
; GFX12-WGP-LABEL: agent_one_as_seq_cst_fence:
; GFX12-WGP: ; %bb.0: ; %entry
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -1321,7 +1304,6 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
;
; GFX12-CU-LABEL: agent_one_as_seq_cst_fence:
; GFX12-CU: ; %bb.0: ; %entry
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
index ea1b8ceb94f11a..ebda33d01a438f 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
@@ -1063,7 +1063,6 @@ define amdgpu_kernel void @workgroup_release_fence() {
;
; GFX12-WGP-LABEL: workgroup_release_fence:
; GFX12-WGP: ; %bb.0: ; %entry
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -1143,7 +1142,6 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
;
; GFX12-WGP-LABEL: workgroup_acq_rel_fence:
; GFX12-WGP: ; %bb.0: ; %entry
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -1224,7 +1222,6 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
;
; GFX12-WGP-LABEL: workgroup_seq_cst_fence:
; GFX12-WGP: ; %bb.0: ; %entry
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -1364,7 +1361,6 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
;
; GFX12-WGP-LABEL: workgroup_one_as_release_fence:
; GFX12-WGP: ; %bb.0: ; %entry
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -1436,7 +1432,6 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
;
; GFX12-WGP-LABEL: workgroup_one_as_acq_rel_fence:
; GFX12-WGP: ; %bb.0: ; %entry
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -1509,7 +1504,6 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
;
; GFX12-WGP-LABEL: workgroup_one_as_seq_cst_fence:
; GFX12-WGP: ; %bb.0: ; %entry
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -1681,7 +1675,6 @@ define amdgpu_kernel void @agent_release_fence() {
;
; GFX12-WGP-LABEL: agent_release_fence:
; GFX12-WGP: ; %bb.0: ; %entry
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -1690,7 +1683,6 @@ define amdgpu_kernel void @agent_release_fence() {
;
; GFX12-CU-LABEL: agent_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -1779,7 +1771,6 @@ define amdgpu_kernel void @agent_acq_rel_fence() {
;
; GFX12-WGP-LABEL: agent_acq_rel_fence:
; GFX12-WGP: ; %bb.0: ; %entry
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -1789,7 +1780,6 @@ define amdgpu_kernel void @agent_acq_rel_fence() {
;
; GFX12-CU-LABEL: agent_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -1879,7 +1869,6 @@ define amdgpu_kernel void @agent_seq_cst_fence() {
;
; GFX12-WGP-LABEL: agent_seq_cst_fence:
; GFX12-WGP: ; %bb.0: ; %entry
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -1889,7 +1878,6 @@ define amdgpu_kernel void @agent_seq_cst_fence() {
;
; GFX12-CU-LABEL: agent_seq_cst_fence:
; GFX12-CU: ; %bb.0: ; %entry
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -2057,7 +2045,6 @@ define amdgpu_kernel void @agent_one_as_release_fence() {
;
; GFX12-WGP-LABEL: agent_one_as_release_fence:
; GFX12-WGP: ; %bb.0: ; %entry
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -2066,7 +2053,6 @@ define amdgpu_kernel void @agent_one_as_release_fence() {
;
; GFX12-CU-LABEL: agent_one_as_release_fence:
; GFX12-CU: ; %bb.0: ; %entry
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -2155,7 +2141,6 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
;
; GFX12-WGP-LABEL: agent_one_as_acq_rel_fence:
; GFX12-WGP: ; %bb.0: ; %entry
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -2165,7 +2150,6 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
;
; GFX12-CU-LABEL: agent_one_as_acq_rel_fence:
; GFX12-CU: ; %bb.0: ; %entry
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -2255,7 +2239,6 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
;
; GFX12-WGP-LABEL: agent_one_as_seq_cst_fence:
; GFX12-WGP: ; %bb.0: ; %entry
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -2265,7 +2248,6 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
;
; GFX12-CU-LABEL: agent_one_as_seq_cst_fence:
; GFX12-CU: ; %bb.0: ; %entry
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
index 1c33d8a19890db..23a4cac25d1aa1 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
@@ -1233,7 +1233,6 @@ define amdgpu_kernel void @flat_agent_release_store(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -1249,7 +1248,6 @@ define amdgpu_kernel void @flat_agent_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -1399,7 +1397,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -1415,7 +1412,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -1916,7 +1912,6 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -1934,7 +1929,6 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -2123,7 +2117,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -2143,7 +2136,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -2334,7 +2326,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -2354,7 +2345,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -2755,7 +2745,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -2778,7 +2767,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -2985,7 +2973,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -3008,7 +2995,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -3749,7 +3735,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -3771,7 +3756,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -4035,7 +4019,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -4059,7 +4042,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -4325,7 +4307,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -4349,7 +4330,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -5143,7 +5123,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -5167,7 +5146,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -5433,7 +5411,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -5457,7 +5434,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -5723,7 +5699,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -5747,7 +5722,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -6013,7 +5987,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -6037,7 +6010,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -6303,7 +6275,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -6327,7 +6298,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -6593,7 +6563,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -6617,7 +6586,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -6883,7 +6851,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -6907,7 +6874,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -7173,7 +7139,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -7197,7 +7162,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -8033,7 +7997,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -8059,7 +8022,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -8347,7 +8309,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -8376,7 +8337,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -8667,7 +8627,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -8696,7 +8655,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -9571,7 +9529,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -9600,7 +9557,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -9891,7 +9847,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -9920,7 +9875,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -10211,7 +10165,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -10240,7 +10193,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -10531,7 +10483,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -10560,7 +10511,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -10851,7 +10801,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -10878,7 +10827,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -11167,7 +11115,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -11196,7 +11143,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -11487,7 +11433,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -11516,7 +11461,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -11807,7 +11751,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -11836,7 +11779,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -13100,7 +13042,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -13116,7 +13057,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -13266,7 +13206,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -13282,7 +13221,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -13779,7 +13717,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -13797,7 +13734,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -13982,7 +13918,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -14002,7 +13937,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -14189,7 +14123,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -14209,7 +14142,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -14628,7 +14560,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -14652,7 +14583,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -14868,7 +14798,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -14892,7 +14821,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -15630,7 +15558,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -15652,7 +15579,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -15912,7 +15838,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -15936,7 +15861,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -16198,7 +16122,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -16222,7 +16145,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -17004,7 +16926,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -17028,7 +16949,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -17290,7 +17210,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -17314,7 +17233,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -17576,7 +17494,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -17600,7 +17517,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -17862,7 +17778,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -17886,7 +17801,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -18148,7 +18062,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -18172,7 +18085,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -18434,7 +18346,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -18458,7 +18369,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -18720,7 +18630,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -18744,7 +18653,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -19006,7 +18914,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -19030,7 +18937,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -19876,7 +19782,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -19902,7 +19807,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -20198,7 +20102,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -20228,7 +20131,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -20528,7 +20430,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -20558,7 +20459,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -21462,7 +21362,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -21492,7 +21391,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -21792,7 +21690,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -21822,7 +21719,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -22122,7 +22018,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -22152,7 +22047,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -22452,7 +22346,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -22482,7 +22375,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -22782,7 +22674,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -22810,7 +22701,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -23108,7 +22998,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -23138,7 +23027,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -23438,7 +23326,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -23468,7 +23355,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -23768,7 +23654,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -23798,7 +23683,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
index f1b465c1789da7..ebae2b6152e7bf 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
@@ -1042,7 +1042,6 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
index 5ddabad7374ddd..4a073a771ac0c0 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
@@ -1216,7 +1216,6 @@ define amdgpu_kernel void @flat_workgroup_release_store(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -1374,7 +1373,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -1871,7 +1869,6 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -2059,7 +2056,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -2250,7 +2246,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -2656,7 +2651,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -2873,7 +2867,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -3615,7 +3608,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -3882,7 +3874,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -4152,7 +4143,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -4926,7 +4916,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -5196,7 +5185,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -5466,7 +5454,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -5736,7 +5723,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -6584,7 +6570,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -6887,7 +6872,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -7194,7 +7178,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -8077,7 +8060,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -8384,7 +8366,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -8691,7 +8672,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -8998,7 +8978,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -9305,7 +9284,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -9610,7 +9588,6 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -9917,7 +9894,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -10224,7 +10200,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -11453,7 +11428,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -11604,7 +11578,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -12085,7 +12058,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -12258,7 +12230,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -12433,7 +12404,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -12823,7 +12793,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -13029,7 +12998,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -13755,7 +13723,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -14007,7 +13974,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -14261,7 +14227,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -15001,7 +14966,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -15255,7 +15219,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -15509,7 +15472,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -15763,7 +15725,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -16017,7 +15978,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -16271,7 +16231,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -16525,7 +16484,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -16779,7 +16737,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -17615,7 +17572,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -17907,7 +17863,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -18203,7 +18158,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -19067,7 +19021,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -19363,7 +19316,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -19659,7 +19611,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -19955,7 +19906,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -20251,7 +20201,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -20545,7 +20494,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -20841,7 +20789,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -21137,7 +21084,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
index 8a02ad5dfdb7b0..ddfc232bdf55b3 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
@@ -1360,7 +1360,6 @@ define amdgpu_kernel void @global_agent_release_store(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -1377,7 +1376,6 @@ define amdgpu_kernel void @global_agent_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -1556,7 +1554,6 @@ define amdgpu_kernel void @global_agent_seq_cst_store(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -1573,7 +1570,6 @@ define amdgpu_kernel void @global_agent_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -2088,7 +2084,6 @@ define amdgpu_kernel void @global_agent_release_atomicrmw(
; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -2103,7 +2098,6 @@ define amdgpu_kernel void @global_agent_release_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -2298,7 +2292,6 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -2315,7 +2308,6 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -2512,7 +2504,6 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -2529,7 +2520,6 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -2940,7 +2930,6 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw(
; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -2960,7 +2949,6 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -3173,7 +3161,6 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw(
; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -3193,7 +3180,6 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -3911,7 +3897,6 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -3932,7 +3917,6 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -4189,7 +4173,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -4212,7 +4195,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -4471,7 +4453,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -4494,7 +4475,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -5263,7 +5243,6 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -5286,7 +5265,6 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -5545,7 +5523,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -5568,7 +5545,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -5827,7 +5803,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -5850,7 +5825,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -6109,7 +6083,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -6132,7 +6105,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -6391,7 +6363,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -6414,7 +6385,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -6673,7 +6643,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -6696,7 +6665,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -6955,7 +6923,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -6978,7 +6945,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -7237,7 +7203,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -7260,7 +7225,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -8050,7 +8014,6 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -8073,7 +8036,6 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -8349,7 +8311,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -8375,7 +8336,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -8654,7 +8614,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -8680,7 +8639,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -9511,7 +9469,6 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -9537,7 +9494,6 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -9816,7 +9772,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -9842,7 +9797,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -10121,7 +10075,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -10147,7 +10100,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -10426,7 +10378,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -10452,7 +10403,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -10731,7 +10681,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -10755,7 +10704,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -11032,7 +10980,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -11058,7 +11005,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -11337,7 +11283,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -11363,7 +11308,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -11642,7 +11586,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -11668,7 +11611,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
@@ -13036,7 +12978,6 @@ define amdgpu_kernel void @global_agent_one_as_release_store(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -13053,7 +12994,6 @@ define amdgpu_kernel void @global_agent_one_as_release_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -13232,7 +13172,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -13249,7 +13188,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store(
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -13764,7 +13702,6 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw(
; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -13779,7 +13716,6 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -13974,7 +13910,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -13991,7 +13926,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -14188,7 +14122,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -14205,7 +14138,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -14616,7 +14548,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw(
; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -14636,7 +14567,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -14849,7 +14779,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw(
; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -14869,7 +14798,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -15587,7 +15515,6 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -15608,7 +15535,6 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -15865,7 +15791,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -15888,7 +15813,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -16147,7 +16071,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -16170,7 +16093,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -16939,7 +16861,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -16962,7 +16883,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -17221,7 +17141,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -17244,7 +17163,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -17503,7 +17421,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -17526,7 +17443,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -17785,7 +17701,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -17808,7 +17723,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -18067,7 +17981,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -18090,7 +18003,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -18349,7 +18261,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -18372,7 +18283,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -18631,7 +18541,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -18654,7 +18563,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -18913,7 +18821,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -18936,7 +18843,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -19741,7 +19647,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -19767,7 +19672,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -20046,7 +19950,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -20072,7 +19975,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -20903,7 +20805,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -20929,7 +20830,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -21208,7 +21108,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -21234,7 +21133,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -21513,7 +21411,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -21539,7 +21436,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -21818,7 +21714,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -21844,7 +21739,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -22123,7 +22017,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -22147,7 +22040,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -22424,7 +22316,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -22450,7 +22341,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -22729,7 +22619,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -22755,7 +22644,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
@@ -23034,7 +22922,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_DEV
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -23060,7 +22947,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-CU-NEXT: global_wb scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
index d916ff533e77bd..29d57f9ceaa4c6 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
@@ -989,7 +989,6 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
index 26511f079fa8f8..4a5d215bcede68 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
@@ -1327,7 +1327,6 @@ define amdgpu_kernel void @global_workgroup_release_store(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -1515,7 +1514,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -2020,7 +2018,6 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -2205,7 +2202,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -2392,7 +2388,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -2789,7 +2784,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -3001,7 +2995,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -3709,7 +3702,6 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -3962,7 +3954,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -4217,7 +4208,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -4944,7 +4934,6 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -5199,7 +5188,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -5454,7 +5442,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -5709,7 +5696,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -5964,7 +5950,6 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -6219,7 +6204,6 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -6474,7 +6458,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -6729,7 +6712,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -7520,7 +7502,6 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -7800,7 +7781,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -8083,7 +8063,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -8892,7 +8871,6 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -9175,7 +9153,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -9458,7 +9435,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -9741,7 +9717,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -10024,7 +9999,6 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -10305,7 +10279,6 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -10588,7 +10561,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -10871,7 +10843,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -12214,7 +12185,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -12394,7 +12364,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -12891,7 +12860,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -13068,7 +13036,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -13247,7 +13214,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -13636,7 +13602,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -13840,7 +13805,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -14540,7 +14504,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -14785,7 +14748,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -15032,7 +14994,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -15751,7 +15712,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -15998,7 +15958,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -16245,7 +16204,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -16492,7 +16450,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -16739,7 +16696,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -16986,7 +16942,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -17233,7 +17188,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -17480,7 +17434,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -18263,7 +18216,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -18535,7 +18487,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -18810,7 +18761,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -19611,7 +19561,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -19886,7 +19835,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -20161,7 +20109,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -20436,7 +20383,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -20711,7 +20657,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -20984,7 +20929,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -21259,7 +21203,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
@@ -21534,7 +21477,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
index d925ca52f85600..b4a95d23788a9a 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
@@ -1140,7 +1140,6 @@ define amdgpu_kernel void @local_agent_release_store(
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -1299,7 +1298,6 @@ define amdgpu_kernel void @local_agent_seq_cst_store(
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -1754,7 +1752,6 @@ define amdgpu_kernel void @local_agent_release_atomicrmw(
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -1926,7 +1923,6 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw(
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -2101,7 +2097,6 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw(
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -2491,7 +2486,6 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw(
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -2698,7 +2692,6 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw(
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -3282,7 +3275,6 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -3496,7 +3488,6 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -3713,7 +3704,6 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -4326,7 +4316,6 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -4543,7 +4532,6 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -4760,7 +4748,6 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -4977,7 +4964,6 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -5194,7 +5180,6 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -5411,7 +5396,6 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -5628,7 +5612,6 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -5845,7 +5828,6 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -6533,7 +6515,6 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -6779,7 +6760,6 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -7026,7 +7006,6 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -7729,7 +7708,6 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -7976,7 +7954,6 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -8223,7 +8200,6 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -8470,7 +8446,6 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -8717,7 +8692,6 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -8964,7 +8938,6 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -9211,7 +9184,6 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -9458,7 +9430,6 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
index 548c5aceb25f74..3a337bc74282a6 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
@@ -1140,7 +1140,6 @@ define amdgpu_kernel void @local_system_release_store(
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -1299,7 +1298,6 @@ define amdgpu_kernel void @local_system_seq_cst_store(
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -1754,7 +1752,6 @@ define amdgpu_kernel void @local_system_release_atomicrmw(
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -1926,7 +1923,6 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw(
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -2101,7 +2097,6 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw(
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -2491,7 +2486,6 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw(
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -2698,7 +2692,6 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw(
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -3282,7 +3275,6 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -3496,7 +3488,6 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -3713,7 +3704,6 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -4326,7 +4316,6 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -4543,7 +4532,6 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -4760,7 +4748,6 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -4977,7 +4964,6 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -5194,7 +5180,6 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -5411,7 +5396,6 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -5628,7 +5612,6 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -5845,7 +5828,6 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -6533,7 +6515,6 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -6779,7 +6760,6 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -7026,7 +7006,6 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -7729,7 +7708,6 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -7976,7 +7954,6 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -8223,7 +8200,6 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -8470,7 +8446,6 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -8717,7 +8692,6 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -8964,7 +8938,6 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -9211,7 +9184,6 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -9458,7 +9430,6 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
index a8f7051bd5050c..4439f9ef818a97 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
@@ -848,7 +848,6 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store(
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
index 0cf644c006facd..af6033c844209d 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
@@ -1140,7 +1140,6 @@ define amdgpu_kernel void @local_workgroup_release_store(
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -1299,7 +1298,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store(
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -1754,7 +1752,6 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw(
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -1926,7 +1923,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw(
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -2101,7 +2097,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw(
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -2491,7 +2486,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw(
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -2698,7 +2692,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw(
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -3282,7 +3275,6 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -3496,7 +3488,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -3713,7 +3704,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -4326,7 +4316,6 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -4543,7 +4532,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -4760,7 +4748,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -4977,7 +4964,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -5194,7 +5180,6 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -5411,7 +5396,6 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -5628,7 +5612,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -5845,7 +5828,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -6533,7 +6515,6 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -6779,7 +6760,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -7026,7 +7006,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -7729,7 +7708,6 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -7976,7 +7954,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -8223,7 +8200,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -8470,7 +8446,6 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -8717,7 +8692,6 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -8964,7 +8938,6 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -9211,7 +9184,6 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
@@ -9458,7 +9430,6 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s1
-; GFX12-WGP-NEXT: global_wb scope:SCOPE_SE
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
More information about the llvm-commits
mailing list