[llvm] [AMDGPU][gfx1250] Implement SIMemoryLegalizer (PR #154726)
Pierre van Houtryve via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 9 01:29:43 PDT 2025
https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/154726
>From 287380010f8d75ba08a4db09d7fc5b481d4bf87b Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Thu, 21 Aug 2025 12:14:21 +0200
Subject: [PATCH 1/5] [AMDGPU][gfx1250] Implement SIMemoryLegalizer
Implements the base of the MemoryLegalizer for a roughly correct GFX1250 memory model.
Documentation will come later, and some remaining changes still have to be added, but this is the backbone of the model.
---
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 4 +
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 +
llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 73 +-
llvm/lib/Target/AMDGPU/SOPInstructions.td | 5 +
.../AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll | 22 +
.../CodeGen/AMDGPU/atomics-system-scope.ll | 8 +
.../test/CodeGen/AMDGPU/flat-saddr-atomics.ll | 292 +-
.../CodeGen/AMDGPU/fp64-atomics-gfx90a.ll | 25 +
.../memory-legalizer-fence-mmra-global.ll | 242 +-
.../CodeGen/AMDGPU/memory-legalizer-fence.ll | 246 +-
.../AMDGPU/memory-legalizer-flat-agent.ll | 2964 ++++++++--------
.../AMDGPU/memory-legalizer-flat-lastuse.ll | 22 +-
.../memory-legalizer-flat-nontemporal.ll | 22 +-
.../memory-legalizer-flat-singlethread.ll | 2220 ++++++------
.../AMDGPU/memory-legalizer-flat-system.ll | 3022 ++++++++---------
.../AMDGPU/memory-legalizer-flat-volatile.ll | 74 +-
.../AMDGPU/memory-legalizer-flat-wavefront.ll | 2189 ++++++------
.../AMDGPU/memory-legalizer-flat-workgroup.ll | 2330 +++++++------
.../AMDGPU/memory-legalizer-global-agent.ll | 2888 ++++++++--------
.../AMDGPU/memory-legalizer-global-lastuse.ll | 22 +-
.../memory-legalizer-global-nontemporal.ll | 22 +-
.../memory-legalizer-global-singlethread.ll | 2220 ++++++------
.../AMDGPU/memory-legalizer-global-system.ll | 2789 ++++++++-------
.../memory-legalizer-global-volatile.ll | 74 +-
.../memory-legalizer-global-wavefront.ll | 2220 ++++++------
.../memory-legalizer-global-workgroup.ll | 2442 +++++++------
.../AMDGPU/memory-legalizer-local-agent.ll | 824 ++---
.../AMDGPU/memory-legalizer-local-system.ll | 824 ++---
.../AMDGPU/memory-legalizer-local-volatile.ll | 21 +-
.../memory-legalizer-local-workgroup.ll | 824 ++---
30 files changed, 14856 insertions(+), 14076 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 4475c8d1d1602..556ec683f2ec6 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1835,6 +1835,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasScratchBaseForwardingHazard() const {
return GFX1250Insts && getGeneration() == GFX12;
}
+
+ /// \returns true if the subtarget requires a wait for xcnt before atomic
+ /// flat/global stores & rmw.
+ bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; }
};
class GCNUserSGPRUsageInfo {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index c964d02ee2b97..f7dde2b90b68e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1055,6 +1055,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
return AMDGPU::S_WAIT_DSCNT;
case AMDGPU::S_WAIT_KMCNT_soft:
return AMDGPU::S_WAIT_KMCNT;
+ case AMDGPU::S_WAIT_XCNT_soft:
+ return AMDGPU::S_WAIT_XCNT;
default:
return Opcode;
}
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 6ab4eb4bde97c..95fa03fc97e69 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -606,7 +606,11 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;
public:
- SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
+ SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {
+ // GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases
+ // the behavior is the same if assuming GFX12.0 in CU mode.
+ assert(ST.hasGFX1250Insts() ? ST.isCuModeEnabled() : true);
+ }
bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
@@ -2378,12 +2382,16 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
STORECnt |= true;
break;
case SIAtomicScope::WORKGROUP:
- // In WGP mode the waves of a work-group can be executing on either CU of
- // the WGP. Therefore need to wait for operations to complete to ensure
- // they are visible to waves in the other CU as the L0 is per CU.
- // Otherwise in CU mode and all waves of a work-group are on the same CU
- // which shares the same L0.
- if (!ST.isCuModeEnabled()) {
+ // GFX12.0:
+ // In WGP mode the waves of a work-group can be executing on either CU
+ // of the WGP. Therefore need to wait for operations to complete to
+ // ensure they are visible to waves in the other CU as the L0 is per CU.
+ // Otherwise in CU mode and all waves of a work-group are on the same CU
+ // which shares the same L0.
+ //
+ // GFX12.5:
+ // TODO DOCS
+ if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) {
if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
LOADCnt |= true;
if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
@@ -2404,7 +2412,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
- case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WORKGROUP:
// If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
// not needed as LDS operations for all waves are executed in a total
// global ordering as observed by all waves. Required if also
@@ -2435,7 +2443,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
//
// This also applies to fences. Fences cannot pair with an instruction
// tracked with bvh/samplecnt as we don't have any atomics that do that.
- if (Order != AtomicOrdering::Acquire) {
+ if (Order != AtomicOrdering::Acquire && ST.hasImageInsts()) {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
}
@@ -2487,10 +2495,14 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
ScopeImm = AMDGPU::CPol::SCOPE_DEV;
break;
case SIAtomicScope::WORKGROUP:
- // In WGP mode the waves of a work-group can be executing on either CU of
- // the WGP. Therefore we need to invalidate the L0 which is per CU.
- // Otherwise in CU mode all waves of a work-group are on the same CU, and so
- // the L0 does not need to be invalidated.
+ // GFX12.0:
+ // In WGP mode the waves of a work-group can be executing on either CU of
+ // the WGP. Therefore we need to invalidate the L0 which is per CU.
+ // Otherwise in CU mode all waves of a work-group are on the same CU, and
+ // so the L0 does not need to be invalidated.
+ //
+ // GFX12.5:
+ // TODO DOCS
if (ST.isCuModeEnabled())
return false;
@@ -2535,7 +2547,8 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
if (Pos == Position::AFTER)
++MI;
- // global_wb is only necessary at system scope for gfx120x targets.
+ // global_wb is only necessary at system scope for GFX12.0;
+ // it is also necessary at device scope for GFX12.5.
//
// Emitting it for lower scopes is a slow no-op, so we omit it
// for performance.
@@ -2545,6 +2558,12 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
.addImm(AMDGPU::CPol::SCOPE_SYS);
break;
case SIAtomicScope::AGENT:
+ // TODO DOCS
+ if (ST.hasGFX1250Insts()) {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
+ .addImm(AMDGPU::CPol::SCOPE_DEV);
+ }
+ break;
case SIAtomicScope::WORKGROUP:
// No WB necessary, but we still have to wait.
break;
@@ -2607,17 +2626,31 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
}
bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
- MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
- if (!CPol)
- return false;
+ assert(MI.mayStore() && "Not a Store inst");
+ const bool IsRMW = (MI.mayLoad() && MI.mayStore());
+ bool Changed = false;
+
+ // GFX12.5 only: xcnt wait is needed before flat and global atomic stores/rmw.
+ if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
+ Changed = true;
+ }
+
+ // Remaining fixes do not apply to RMWs
+ if (IsRMW)
+ return Changed;
+ MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
+ if (!CPol) // Some vmem operations do not have a scope and are not affected.
+ return Changed;
const unsigned Scope = CPol->getImm() & CPol::SCOPE;
// GFX12.0 only: Extra waits needed before system scope stores.
if (!ST.hasGFX1250Insts()) {
if (!Atomic && Scope == CPol::SCOPE_SYS)
return insertWaitsBeforeSystemScopeStore(MI);
- return false;
+ return Changed;
}
// GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address
@@ -2627,7 +2660,7 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
(!ST.hasCUStores() || TII->mayAccessScratchThroughFlat(MI)))
return setScope(MI, CPol::SCOPE_SE);
- return false;
+ return Changed;
}
bool SIGfx12CacheControl::handleCooperativeAtomic(MachineInstr &MI) const {
@@ -2839,6 +2872,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
assert(MI->mayLoad() && MI->mayStore());
bool Changed = false;
+ MachineInstr &RMWMI = *MI;
if (MOI.isAtomic()) {
const AtomicOrdering Order = MOI.getOrdering();
@@ -2873,6 +2907,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
Position::AFTER);
}
+ Changed |= CC->finalizeStore(RMWMI, /*Atomic=*/true);
return Changed;
}
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index dbe0b8c496fed..e170268b47c44 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1653,6 +1653,11 @@ let OtherPredicates = [HasImageInsts] in {
def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">;
}
+
+let SubtargetPredicate = HasWaitXcnt in {
+ def S_WAIT_XCNT_soft : SOPP_Pseudo<"s_soft_wait_xcnt", (ins s16imm:$simm16), "$simm16">;
+}
+
// Represents the point at which a wave must wait for all outstanding direct loads to LDS.
// Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 481a2540eacb7..e886ea4fc6ac6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -1501,6 +1501,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1571,6 +1572,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1645,6 +1649,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1715,6 +1720,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1792,6 +1800,7 @@ define double @global_atomic_fadd_f64_rtn_pat_agent(ptr addrspace(1) %ptr, doubl
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
@@ -1902,6 +1911,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1]
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1947,6 +1959,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -1987,6 +2000,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2031,6 +2047,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -2107,6 +2124,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 {
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -2190,6 +2208,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2418,6 +2439,7 @@ define double @local_atomic_fadd_f64_rtn_pat(ptr addrspace(3) %ptr, double %data
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll
index 5fc9f4a0f8038..4bb2a13d02cc7 100644
--- a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll
@@ -364,6 +364,7 @@ define i16 @global_one_as_atomic_min_i16(ptr addrspace(1) %ptr, i16 %val) {
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -406,6 +407,7 @@ define i16 @global_one_as_atomic_umin_i16(ptr addrspace(1) %ptr, i16 %val) {
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -448,6 +450,7 @@ define i16 @global_one_as_atomic_max_i16(ptr addrspace(1) %ptr, i16 %val) {
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -490,6 +493,7 @@ define i16 @global_one_as_atomic_umax_i16(ptr addrspace(1) %ptr, i16 %val) {
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -1344,6 +1348,7 @@ define i16 @flat_one_as_atomic_min_i16(ptr %ptr, i16 %val) {
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -1386,6 +1391,7 @@ define i16 @flat_one_as_atomic_umin_i16(ptr %ptr, i16 %val) {
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -1428,6 +1434,7 @@ define i16 @flat_one_as_atomic_max_i16(ptr %ptr, i16 %val) {
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
@@ -1470,6 +1477,7 @@ define i16 @flat_one_as_atomic_umax_i16(ptr %ptr, i16 %val) {
; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
index 3dedf008c917e..62129ebe40358 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
@@ -10,6 +10,8 @@
define amdgpu_ps void @flat_xchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_xchg_saddr_i32_nortn:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -47,6 +49,8 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_2047(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_xchg_saddr_i32_nortn_offset_2047:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[2:3] offset:2047 scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -85,6 +89,8 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_2047(ptr inreg %sbase, i
define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_neg2048(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_xchg_saddr_i32_nortn_offset_neg2048:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[2:3] offset:-2048 scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -128,6 +134,8 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_neg2048(ptr inreg %sbase
define amdgpu_ps float @flat_xchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_xchg_saddr_i32_rtn:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -166,6 +174,8 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset,
define amdgpu_ps float @flat_xchg_saddr_i32_rtn_2048(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_xchg_saddr_i32_rtn_2048:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[2:3] offset:2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -205,6 +215,8 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn_2048(ptr inreg %sbase, i32 %voff
define amdgpu_ps float @flat_xchg_saddr_i32_rtn_neg2048(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_xchg_saddr_i32_rtn_neg2048:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -262,6 +274,8 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2
; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -276,6 +290,8 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v0, v[2:3], v1 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -325,6 +341,8 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 %
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2
; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[0:1] offset:42 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -339,6 +357,8 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 %
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v0, v[2:3], v1 offset:42 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -389,6 +409,8 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset,
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2
; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -402,6 +424,8 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset,
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v[2:3], v1 scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -449,6 +473,8 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i32
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2
; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] offset:42 scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -462,6 +488,8 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i32
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v[2:3], v1 offset:42 scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -532,6 +560,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB10_5
; GFX1250-SDAG-NEXT: .LBB10_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_swap_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -578,6 +608,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB10_5
; GFX1250-GISEL-NEXT: .LBB10_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -712,6 +744,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB11_5
; GFX1250-SDAG-NEXT: .LBB11_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_swap_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -761,6 +795,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB11_5
; GFX1250-GISEL-NEXT: .LBB11_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -896,6 +932,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: .LBB12_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB12_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -933,6 +971,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: .LBB12_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB12_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -1044,6 +1084,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: .LBB13_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB13_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -1084,6 +1126,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: .LBB13_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB13_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -1186,6 +1230,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
define amdgpu_ps float @flat_add_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_add_saddr_i32_rtn:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_add_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -1224,6 +1270,8 @@ define amdgpu_ps float @flat_add_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i
define amdgpu_ps float @flat_add_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_add_saddr_i32_rtn_neg128:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_add_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -1269,6 +1317,8 @@ define amdgpu_ps float @flat_add_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof
define amdgpu_ps void @flat_add_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_add_saddr_i32_nortn:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_add_u32 v0, v1, s[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -1305,6 +1355,8 @@ define amdgpu_ps void @flat_add_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
define amdgpu_ps void @flat_add_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_add_saddr_i32_nortn_neg128:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_add_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -1368,6 +1420,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB18_5
; GFX1250-SDAG-NEXT: .LBB18_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -1414,6 +1468,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB18_5
; GFX1250-GISEL-NEXT: .LBB18_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -1552,6 +1608,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB19_5
; GFX1250-SDAG-NEXT: .LBB19_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -1601,6 +1659,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB19_5
; GFX1250-GISEL-NEXT: .LBB19_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -1740,6 +1800,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: .LBB20_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB20_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -1780,6 +1842,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: .LBB20_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB20_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -1902,6 +1966,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: .LBB21_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB21_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -1945,6 +2011,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: .LBB21_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB21_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -2058,6 +2126,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
define amdgpu_ps float @flat_sub_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_sub_saddr_i32_rtn:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_sub_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -2096,6 +2166,8 @@ define amdgpu_ps float @flat_sub_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i
define amdgpu_ps float @flat_sub_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_sub_saddr_i32_rtn_neg128:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_sub_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -2141,6 +2213,8 @@ define amdgpu_ps float @flat_sub_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof
define amdgpu_ps void @flat_sub_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_sub_saddr_i32_nortn:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_sub_u32 v0, v1, s[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -2177,6 +2251,8 @@ define amdgpu_ps void @flat_sub_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
define amdgpu_ps void @flat_sub_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_sub_saddr_i32_nortn_neg128:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_sub_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -2240,6 +2316,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB26_5
; GFX1250-SDAG-NEXT: .LBB26_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_sub_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -2286,6 +2364,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB26_5
; GFX1250-GISEL-NEXT: .LBB26_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -2426,6 +2506,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB27_5
; GFX1250-SDAG-NEXT: .LBB27_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_sub_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -2475,6 +2557,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB27_5
; GFX1250-GISEL-NEXT: .LBB27_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -2616,6 +2700,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: .LBB28_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB28_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -2656,6 +2742,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: .LBB28_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB28_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -2780,6 +2868,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: .LBB29_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB29_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -2823,6 +2913,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: .LBB29_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB29_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -2938,6 +3030,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
define amdgpu_ps float @flat_and_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_and_saddr_i32_rtn:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_and_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -2976,6 +3070,8 @@ define amdgpu_ps float @flat_and_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i
define amdgpu_ps float @flat_and_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_and_saddr_i32_rtn_neg128:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_and_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -3021,6 +3117,8 @@ define amdgpu_ps float @flat_and_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof
define amdgpu_ps void @flat_and_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_and_saddr_i32_nortn:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_and_b32 v0, v1, s[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -3057,6 +3155,8 @@ define amdgpu_ps void @flat_and_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
define amdgpu_ps void @flat_and_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_and_saddr_i32_nortn_neg128:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_and_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -3120,6 +3220,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB34_5
; GFX1250-SDAG-NEXT: .LBB34_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_and_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -3167,6 +3269,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB34_5
; GFX1250-GISEL-NEXT: .LBB34_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -3306,6 +3410,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB35_5
; GFX1250-SDAG-NEXT: .LBB35_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_and_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -3356,6 +3462,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB35_5
; GFX1250-GISEL-NEXT: .LBB35_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -3496,6 +3604,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: .LBB36_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB36_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -3537,6 +3647,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: .LBB36_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB36_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -3660,6 +3772,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: .LBB37_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB37_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -3704,6 +3818,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: .LBB37_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB37_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -3818,6 +3934,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
define amdgpu_ps float @flat_or_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_or_saddr_i32_rtn:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_or_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -3856,6 +3974,8 @@ define amdgpu_ps float @flat_or_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i3
define amdgpu_ps float @flat_or_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_or_saddr_i32_rtn_neg128:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_or_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -3901,6 +4021,8 @@ define amdgpu_ps float @flat_or_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voff
define amdgpu_ps void @flat_or_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_or_saddr_i32_nortn:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_or_b32 v0, v1, s[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -3937,6 +4059,8 @@ define amdgpu_ps void @flat_or_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i
define amdgpu_ps void @flat_or_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_or_saddr_i32_nortn_neg128:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_or_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -4000,6 +4124,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB42_5
; GFX1250-SDAG-NEXT: .LBB42_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_or_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -4047,6 +4173,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB42_5
; GFX1250-GISEL-NEXT: .LBB42_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -4186,6 +4314,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB43_5
; GFX1250-SDAG-NEXT: .LBB43_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_or_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -4236,6 +4366,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB43_5
; GFX1250-GISEL-NEXT: .LBB43_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -4376,6 +4508,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i
; GFX1250-SDAG-NEXT: .LBB44_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB44_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -4417,6 +4551,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i
; GFX1250-GISEL-NEXT: .LBB44_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB44_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -4540,6 +4676,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: .LBB45_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB45_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -4584,6 +4722,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: .LBB45_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB45_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -4698,6 +4838,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
define amdgpu_ps float @flat_xor_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_xor_saddr_i32_rtn:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_xor_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -4736,6 +4878,8 @@ define amdgpu_ps float @flat_xor_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i
define amdgpu_ps float @flat_xor_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_xor_saddr_i32_rtn_neg128:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_xor_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -4781,6 +4925,8 @@ define amdgpu_ps float @flat_xor_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof
define amdgpu_ps void @flat_xor_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_xor_saddr_i32_nortn:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_xor_b32 v0, v1, s[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -4817,6 +4963,8 @@ define amdgpu_ps void @flat_xor_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
define amdgpu_ps void @flat_xor_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_xor_saddr_i32_nortn_neg128:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_xor_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -4880,6 +5028,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB50_5
; GFX1250-SDAG-NEXT: .LBB50_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_xor_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -4927,6 +5077,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB50_5
; GFX1250-GISEL-NEXT: .LBB50_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -5066,6 +5218,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB51_5
; GFX1250-SDAG-NEXT: .LBB51_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_xor_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -5116,6 +5270,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB51_5
; GFX1250-GISEL-NEXT: .LBB51_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -5256,6 +5412,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: .LBB52_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB52_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -5297,6 +5455,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: .LBB52_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB52_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -5420,6 +5580,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: .LBB53_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB53_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
@@ -5464,6 +5626,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: .LBB53_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB53_3: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
@@ -5650,7 +5814,7 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-LABEL: flat_max_saddr_i32_nortn:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: flat_atomic_max_i32 v0, v1, s[2:3]
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: flat_max_saddr_i32_nortn:
@@ -5681,7 +5845,7 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-LABEL: flat_max_saddr_i32_nortn_neg128:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: flat_atomic_max_i32 v0, v1, s[2:3] offset:-128
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: flat_max_saddr_i32_nortn_neg128:
@@ -5735,20 +5899,17 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB58_4
; GFX1250-SDAG-NEXT: .LBB58_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB58_5
; GFX1250-SDAG-NEXT: .LBB58_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB58_2
; GFX1250-SDAG-NEXT: .LBB58_4: ; %atomicrmw.private
; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -5782,20 +5943,17 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB58_4
; GFX1250-GISEL-NEXT: .LBB58_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB58_5
; GFX1250-GISEL-NEXT: .LBB58_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB58_2
; GFX1250-GISEL-NEXT: .LBB58_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -5923,20 +6081,17 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB59_4
; GFX1250-SDAG-NEXT: .LBB59_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB59_5
; GFX1250-SDAG-NEXT: .LBB59_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB59_2
; GFX1250-SDAG-NEXT: .LBB59_4: ; %atomicrmw.private
; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -5973,20 +6128,17 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB59_4
; GFX1250-GISEL-NEXT: .LBB59_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB59_5
; GFX1250-GISEL-NEXT: .LBB59_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB59_2
; GFX1250-GISEL-NEXT: .LBB59_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -6119,9 +6271,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB60_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB60_2
@@ -6158,9 +6310,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB60_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v0, v[4:5], s[2:3]
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB60_2
@@ -6279,9 +6431,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB61_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB61_2
@@ -6321,9 +6473,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB61_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v0, v[4:5], s[2:3] offset:-128
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB61_2
@@ -6504,7 +6656,7 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-LABEL: flat_min_saddr_i32_nortn:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: flat_atomic_min_i32 v0, v1, s[2:3]
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: flat_min_saddr_i32_nortn:
@@ -6535,7 +6687,7 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-LABEL: flat_min_saddr_i32_nortn_neg128:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: flat_atomic_min_i32 v0, v1, s[2:3] offset:-128
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: flat_min_saddr_i32_nortn_neg128:
@@ -6589,20 +6741,17 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB66_4
; GFX1250-SDAG-NEXT: .LBB66_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB66_5
; GFX1250-SDAG-NEXT: .LBB66_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB66_2
; GFX1250-SDAG-NEXT: .LBB66_4: ; %atomicrmw.private
; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -6636,20 +6785,17 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB66_4
; GFX1250-GISEL-NEXT: .LBB66_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB66_5
; GFX1250-GISEL-NEXT: .LBB66_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB66_2
; GFX1250-GISEL-NEXT: .LBB66_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -6777,20 +6923,17 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB67_4
; GFX1250-SDAG-NEXT: .LBB67_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB67_5
; GFX1250-SDAG-NEXT: .LBB67_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB67_2
; GFX1250-SDAG-NEXT: .LBB67_4: ; %atomicrmw.private
; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -6827,20 +6970,17 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB67_4
; GFX1250-GISEL-NEXT: .LBB67_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB67_5
; GFX1250-GISEL-NEXT: .LBB67_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB67_2
; GFX1250-GISEL-NEXT: .LBB67_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -6973,9 +7113,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB68_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB68_2
@@ -7012,9 +7152,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB68_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v0, v[4:5], s[2:3]
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB68_2
@@ -7133,9 +7273,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB69_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB69_2
@@ -7175,9 +7315,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB69_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v0, v[4:5], s[2:3] offset:-128
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB69_2
@@ -7358,7 +7498,7 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-LABEL: flat_umax_saddr_i32_nortn:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: flat_atomic_max_u32 v0, v1, s[2:3]
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: flat_umax_saddr_i32_nortn:
@@ -7389,7 +7529,7 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-LABEL: flat_umax_saddr_i32_nortn_neg128:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: flat_atomic_max_u32 v0, v1, s[2:3] offset:-128
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: flat_umax_saddr_i32_nortn_neg128:
@@ -7443,20 +7583,17 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB74_4
; GFX1250-SDAG-NEXT: .LBB74_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB74_5
; GFX1250-SDAG-NEXT: .LBB74_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB74_2
; GFX1250-SDAG-NEXT: .LBB74_4: ; %atomicrmw.private
; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -7490,20 +7627,17 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB74_4
; GFX1250-GISEL-NEXT: .LBB74_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB74_5
; GFX1250-GISEL-NEXT: .LBB74_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB74_2
; GFX1250-GISEL-NEXT: .LBB74_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -7631,20 +7765,17 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB75_4
; GFX1250-SDAG-NEXT: .LBB75_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB75_5
; GFX1250-SDAG-NEXT: .LBB75_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB75_2
; GFX1250-SDAG-NEXT: .LBB75_4: ; %atomicrmw.private
; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -7681,20 +7812,17 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB75_4
; GFX1250-GISEL-NEXT: .LBB75_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB75_5
; GFX1250-GISEL-NEXT: .LBB75_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB75_2
; GFX1250-GISEL-NEXT: .LBB75_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -7827,9 +7955,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB76_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB76_2
@@ -7866,9 +7994,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB76_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v0, v[4:5], s[2:3]
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB76_2
@@ -7987,9 +8115,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB77_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB77_2
@@ -8029,9 +8157,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB77_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v0, v[4:5], s[2:3] offset:-128
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB77_2
@@ -8212,7 +8340,7 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-LABEL: flat_umin_saddr_i32_nortn:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: flat_atomic_min_u32 v0, v1, s[2:3]
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: flat_umin_saddr_i32_nortn:
@@ -8243,7 +8371,7 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-LABEL: flat_umin_saddr_i32_nortn_neg128:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: flat_atomic_min_u32 v0, v1, s[2:3] offset:-128
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: flat_umin_saddr_i32_nortn_neg128:
@@ -8297,20 +8425,17 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB82_4
; GFX1250-SDAG-NEXT: .LBB82_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB82_5
; GFX1250-SDAG-NEXT: .LBB82_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB82_2
; GFX1250-SDAG-NEXT: .LBB82_4: ; %atomicrmw.private
; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -8344,20 +8469,17 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB82_4
; GFX1250-GISEL-NEXT: .LBB82_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB82_5
; GFX1250-GISEL-NEXT: .LBB82_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB82_2
; GFX1250-GISEL-NEXT: .LBB82_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -8485,20 +8607,17 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB83_4
; GFX1250-SDAG-NEXT: .LBB83_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_branch .LBB83_5
; GFX1250-SDAG-NEXT: .LBB83_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB83_2
; GFX1250-SDAG-NEXT: .LBB83_4: ; %atomicrmw.private
; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo
@@ -8535,20 +8654,17 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB83_4
; GFX1250-GISEL-NEXT: .LBB83_2: ; %atomicrmw.phi
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB83_5
; GFX1250-GISEL-NEXT: .LBB83_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB83_2
; GFX1250-GISEL-NEXT: .LBB83_4: ; %atomicrmw.private
; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
@@ -8681,9 +8797,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB84_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB84_2
@@ -8720,9 +8836,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB84_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v0, v[4:5], s[2:3]
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB84_2
@@ -8841,9 +8957,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB85_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[2:3]
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB85_2
@@ -8883,9 +8999,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: s_endpgm
; GFX1250-GISEL-NEXT: .LBB85_3: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v0, v[4:5], s[2:3] offset:-128
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB85_2
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index 3856f0c327495..160b35352d8a4 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -1473,6 +1473,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1513,6 +1514,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1557,6 +1561,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1597,6 +1602,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1673,6 +1681,7 @@ define double @global_atomic_fadd_f64_rtn_pat_agent(ptr addrspace(1) %ptr, doubl
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
@@ -1765,6 +1774,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1809,6 +1821,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -1849,6 +1862,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -1893,6 +1909,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
@@ -1969,6 +1986,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 {
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -2063,6 +2081,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2136,6 +2157,7 @@ define double @local_atomic_fadd_f64_rtn(ptr addrspace(3) %ptr, double %data) {
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -2275,6 +2297,7 @@ define double @local_atomic_fadd_f64_rtn_pat(ptr addrspace(3) %ptr, double %data
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -2307,6 +2330,7 @@ define double @local_atomic_fadd_f64_rtn_ieee_unsafe(ptr addrspace(3) %ptr, doub
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -2339,6 +2363,7 @@ define double @local_atomic_fadd_f64_rtn_ieee_safe(ptr addrspace(3) %ptr, double
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
index 97d52d5f1f26d..209775314a505 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
@@ -80,9 +80,11 @@ define amdgpu_kernel void @workgroup_acquire_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: workgroup_acquire_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: workgroup_acquire_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -151,9 +153,11 @@ define amdgpu_kernel void @workgroup_release_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: workgroup_release_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: workgroup_release_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -227,9 +231,11 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: workgroup_acq_rel_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: workgroup_acq_rel_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -303,9 +309,11 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: workgroup_seq_cst_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: workgroup_seq_cst_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -377,9 +385,11 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: workgroup_one_as_acquire_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: workgroup_one_as_acquire_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -448,9 +458,11 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: workgroup_one_as_release_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: workgroup_one_as_release_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -524,9 +536,11 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: workgroup_one_as_acq_rel_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -600,9 +614,11 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: workgroup_one_as_seq_cst_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -785,13 +801,12 @@ define amdgpu_kernel void @agent_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: agent_release_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: agent_release_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("agent") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -891,14 +906,13 @@ define amdgpu_kernel void @agent_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: agent_acq_rel_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: agent_acq_rel_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("agent") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -998,14 +1012,13 @@ define amdgpu_kernel void @agent_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: agent_seq_cst_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: agent_seq_cst_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("agent") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1188,13 +1201,12 @@ define amdgpu_kernel void @agent_one_as_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: agent_one_as_release_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: agent_one_as_release_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1294,14 +1306,13 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: agent_one_as_acq_rel_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: agent_one_as_acq_rel_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1401,14 +1412,13 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: agent_one_as_seq_cst_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: agent_one_as_seq_cst_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1597,14 +1607,12 @@ define amdgpu_kernel void @system_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: system_release_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: system_release_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1710,15 +1718,13 @@ define amdgpu_kernel void @system_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: system_acq_rel_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: system_acq_rel_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1824,15 +1830,13 @@ define amdgpu_kernel void @system_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: system_seq_cst_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: system_seq_cst_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -2021,14 +2025,12 @@ define amdgpu_kernel void @system_one_as_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: system_one_as_release_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: system_one_as_release_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -2134,15 +2136,13 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: system_one_as_acq_rel_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: system_one_as_acq_rel_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -2248,15 +2248,13 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: system_one_as_seq_cst_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: system_one_as_seq_cst_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
index b3f6533d43887..07db15ee8e60e 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
@@ -1064,10 +1064,11 @@ define amdgpu_kernel void @workgroup_acquire_fence() {
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: workgroup_acquire_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: workgroup_acquire_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acquire
ret void
@@ -1144,10 +1145,11 @@ define amdgpu_kernel void @workgroup_release_fence() {
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: workgroup_release_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: workgroup_release_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup") release
ret void
@@ -1229,10 +1231,11 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: workgroup_acq_rel_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: workgroup_acq_rel_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acq_rel
ret void
@@ -1314,10 +1317,11 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: workgroup_seq_cst_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: workgroup_seq_cst_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup") seq_cst
ret void
@@ -1389,9 +1393,11 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: workgroup_one_as_acquire_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: workgroup_one_as_acquire_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") acquire
ret void
@@ -1460,9 +1466,11 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: workgroup_one_as_release_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: workgroup_one_as_release_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") release
ret void
@@ -1536,9 +1544,11 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: workgroup_one_as_acq_rel_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") acq_rel
ret void
@@ -1612,9 +1622,11 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: workgroup_one_as_seq_cst_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") seq_cst
ret void
@@ -1797,13 +1809,12 @@ define amdgpu_kernel void @agent_release_fence() {
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: agent_release_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: agent_release_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("agent") release
ret void
@@ -1903,14 +1914,13 @@ define amdgpu_kernel void @agent_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: agent_acq_rel_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: agent_acq_rel_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("agent") acq_rel
ret void
@@ -2010,14 +2020,13 @@ define amdgpu_kernel void @agent_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: agent_seq_cst_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: agent_seq_cst_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("agent") seq_cst
ret void
@@ -2200,13 +2209,12 @@ define amdgpu_kernel void @agent_one_as_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: agent_one_as_release_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: agent_one_as_release_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") release
ret void
@@ -2306,14 +2314,13 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: agent_one_as_acq_rel_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: agent_one_as_acq_rel_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") acq_rel
ret void
@@ -2413,14 +2420,13 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: agent_one_as_seq_cst_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: agent_one_as_seq_cst_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") seq_cst
ret void
@@ -2609,14 +2615,12 @@ define amdgpu_kernel void @system_release_fence() {
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: system_release_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: system_release_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence release
ret void
@@ -2722,15 +2726,13 @@ define amdgpu_kernel void @system_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: system_acq_rel_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: system_acq_rel_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence acq_rel
ret void
@@ -2836,15 +2838,13 @@ define amdgpu_kernel void @system_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: system_seq_cst_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: system_seq_cst_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence seq_cst
ret void
@@ -3033,14 +3033,12 @@ define amdgpu_kernel void @system_one_as_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: system_one_as_release_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: system_one_as_release_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("one-as") release
ret void
@@ -3146,15 +3144,13 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: system_one_as_acq_rel_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: system_one_as_acq_rel_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("one-as") acq_rel
ret void
@@ -3260,15 +3256,13 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: system_one_as_seq_cst_fence:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: system_one_as_seq_cst_fence:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
entry:
fence syncscope("one-as") seq_cst
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
index 36adbc0011118..fe7fd8522bd6a 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
@@ -825,23 +825,19 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_seq_cst_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("agent") seq_cst, align 4
@@ -993,15 +989,16 @@ define amdgpu_kernel void @flat_agent_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent") unordered, align 4
@@ -1152,15 +1149,16 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent") monotonic, align 4
@@ -1335,19 +1333,19 @@ define amdgpu_kernel void @flat_agent_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent") release, align 4
@@ -1522,19 +1520,19 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent") seq_cst, align 4
@@ -1685,15 +1683,16 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") monotonic
@@ -1875,17 +1874,18 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acquire
@@ -2060,19 +2060,19 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") release
@@ -2278,21 +2278,21 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acq_rel
@@ -2498,21 +2498,21 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst
@@ -2722,18 +2722,19 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acquire
@@ -2972,24 +2973,22 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acq_rel
@@ -3228,24 +3227,22 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst
@@ -3485,19 +3482,20 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3768,21 +3766,22 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4046,23 +4045,23 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4357,25 +4356,25 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4670,25 +4669,25 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4959,21 +4958,22 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5244,21 +5244,22 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5553,25 +5554,25 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5866,25 +5867,25 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6179,25 +6180,25 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6492,25 +6493,25 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6805,25 +6806,25 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7118,25 +7119,25 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7431,25 +7432,25 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7744,25 +7745,25 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8046,21 +8047,22 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8361,22 +8363,23 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8686,25 +8689,25 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9033,28 +9036,26 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9383,28 +9384,26 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9709,24 +9708,23 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10027,22 +10025,23 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10371,28 +10370,26 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10721,28 +10718,26 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11071,28 +11066,26 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11421,28 +11414,26 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11767,26 +11758,26 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12115,28 +12106,26 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12465,28 +12454,26 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12815,28 +12802,26 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -13679,24 +13664,20 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_seq_cst_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("agent-one-as") seq_cst, align 4
@@ -13848,15 +13829,16 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent-one-as") unordered, align 4
@@ -14007,15 +13989,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent-one-as") monotonic, align 4
@@ -14190,19 +14173,19 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent-one-as") release, align 4
@@ -14377,19 +14360,19 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent-one-as") seq_cst, align 4
@@ -14540,15 +14523,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") monotonic
@@ -14724,19 +14708,20 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-CU-NEXT: s_endpgm
-;
-; GFX1250-LABEL: flat_agent_one_as_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: flat_agent_one_as_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acquire
@@ -14911,19 +14896,19 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") release
@@ -15125,21 +15110,21 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -15341,21 +15326,21 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -15575,19 +15560,20 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acquire
@@ -15836,25 +15822,23 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -16103,25 +16087,23 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -16361,19 +16343,20 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16640,21 +16623,22 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16918,23 +16902,23 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17225,25 +17209,25 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17534,25 +17518,25 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17819,21 +17803,22 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18100,21 +18085,22 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18405,25 +18391,25 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18714,25 +18700,25 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19023,25 +19009,25 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19332,25 +19318,25 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19641,25 +19627,25 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19950,25 +19936,25 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20259,25 +20245,25 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20568,25 +20554,25 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20870,21 +20856,22 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21195,23 +21182,24 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21521,25 +21509,25 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21878,29 +21866,27 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22239,29 +22225,27 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22576,25 +22560,24 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22905,23 +22888,24 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -23260,29 +23244,27 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -23621,29 +23603,27 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -23982,29 +23962,27 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -24343,29 +24321,27 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -24700,27 +24676,27 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -25059,29 +25035,27 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -25420,29 +25394,27 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -25781,29 +25753,27 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
index 5526b29037977..22c1b6f9fe875 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
@@ -107,18 +107,16 @@ define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) {
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_last_use_and_volatile_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_last_use_and_volatile_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
entry:
%val = load volatile i32, ptr %in, align 4, !amdgpu.last.use !{}
store i32 %val, ptr %out
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
index 964f1c8957f6f..c949790b97d72 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
@@ -1322,18 +1322,16 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_nontemporal_volatile_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_nontemporal_volatile_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load volatile i32, ptr %in, align 4, !nontemporal !0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
index 871c941dd6dca..8a75db2c36dc7 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
@@ -929,15 +929,16 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread") unordered, align 4
@@ -1088,15 +1089,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread") monotonic, align 4
@@ -1247,15 +1249,16 @@ define amdgpu_kernel void @flat_singlethread_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread") release, align 4
@@ -1406,15 +1409,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread") seq_cst, align 4
@@ -1565,15 +1569,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") monotonic
@@ -1724,15 +1729,16 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acquire
@@ -1883,15 +1889,16 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") release
@@ -2042,15 +2049,16 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acq_rel
@@ -2201,15 +2209,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") seq_cst
@@ -2404,17 +2413,18 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acquire
@@ -2610,17 +2620,18 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acq_rel
@@ -2816,17 +2827,18 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") seq_cst
@@ -3066,19 +3078,20 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3318,19 +3331,20 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3570,19 +3584,20 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3822,19 +3837,20 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4074,19 +4090,20 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4326,19 +4343,20 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4578,19 +4596,20 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4830,19 +4849,20 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5082,19 +5102,20 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5334,19 +5355,20 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5586,19 +5608,20 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5838,19 +5861,20 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6090,19 +6114,20 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6342,19 +6367,20 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6594,19 +6620,20 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6890,21 +6917,22 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7190,21 +7218,22 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7490,21 +7519,22 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7790,21 +7820,22 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8090,21 +8121,22 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8390,21 +8422,22 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8690,21 +8723,22 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8990,21 +9024,22 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9290,21 +9325,22 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9590,21 +9626,22 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9890,21 +9927,22 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10190,21 +10228,22 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10490,21 +10529,22 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10790,21 +10830,22 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11090,21 +11131,22 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12030,15 +12072,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") unordered, align 4
@@ -12189,15 +12232,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") monotonic, align 4
@@ -12348,15 +12392,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") release, align 4
@@ -12507,15 +12552,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") seq_cst, align 4
@@ -12666,15 +12712,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") monotonic
@@ -12825,15 +12872,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -12984,15 +13032,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") release
@@ -13143,15 +13192,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -13302,15 +13352,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -13505,17 +13556,18 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -13711,17 +13763,18 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -13917,17 +13970,18 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -14167,19 +14221,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14419,19 +14474,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14671,19 +14727,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14923,19 +14980,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15175,19 +15233,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15427,19 +15486,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15679,19 +15739,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15931,19 +15992,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16183,19 +16245,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16435,19 +16498,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16687,19 +16751,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16939,19 +17004,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17191,19 +17257,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17443,19 +17510,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17695,19 +17763,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17991,21 +18060,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18291,21 +18361,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18591,21 +18662,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18891,21 +18963,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19191,21 +19264,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19491,21 +19565,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19791,21 +19866,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20091,21 +20167,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20391,21 +20468,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20691,21 +20769,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20991,21 +21070,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21291,21 +21371,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21591,21 +21672,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21891,21 +21973,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22191,21 +22274,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
index 9d70a2437e553..b5ea23d4655b6 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
@@ -829,23 +829,19 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_seq_cst_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in seq_cst, align 4
@@ -997,15 +993,16 @@ define amdgpu_kernel void @flat_system_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out unordered, align 4
@@ -1156,15 +1153,16 @@ define amdgpu_kernel void @flat_system_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out monotonic, align 4
@@ -1343,20 +1341,19 @@ define amdgpu_kernel void @flat_system_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out release, align 4
@@ -1535,20 +1532,19 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out seq_cst, align 4
@@ -1699,15 +1695,16 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in monotonic
@@ -1891,17 +1888,18 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in acquire
@@ -2080,20 +2078,19 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in release
@@ -2305,22 +2302,21 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in acq_rel
@@ -2532,22 +2528,21 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in seq_cst
@@ -2759,18 +2754,19 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in acquire
@@ -3015,25 +3011,22 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in acq_rel
@@ -3278,25 +3271,22 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in seq_cst
@@ -3536,19 +3526,20 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3821,21 +3812,22 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4103,24 +4095,23 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4421,26 +4412,25 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4741,26 +4731,25 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5033,21 +5022,22 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5320,21 +5310,22 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5635,26 +5626,25 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5955,26 +5945,25 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6275,26 +6264,25 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6595,26 +6583,25 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6915,26 +6902,25 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7235,26 +7221,25 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7555,26 +7540,25 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7875,26 +7859,25 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8178,21 +8161,22 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8495,22 +8479,23 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8824,26 +8809,25 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9178,29 +9162,26 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9535,29 +9516,26 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9864,24 +9842,23 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10184,22 +10161,23 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10534,29 +10512,26 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10891,29 +10866,26 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11248,29 +11220,26 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11605,29 +11574,26 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11958,27 +11924,26 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12313,29 +12278,26 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12670,29 +12632,26 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -13027,29 +12986,26 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -13896,24 +13852,20 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_seq_cst_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("one-as") seq_cst, align 4
@@ -14065,15 +14017,16 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("one-as") unordered, align 4
@@ -14224,15 +14177,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("one-as") monotonic, align 4
@@ -14411,20 +14365,19 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("one-as") release, align 4
@@ -14603,20 +14556,19 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("one-as") seq_cst, align 4
@@ -14767,15 +14719,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") monotonic
@@ -14953,19 +14906,20 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-CU-NEXT: s_endpgm
-;
-; GFX1250-LABEL: flat_system_one_as_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX12-CU-NEXT: s_endpgm
+;
+; GFX1250-CU-LABEL: flat_system_one_as_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acquire
@@ -15144,20 +15098,19 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") release
@@ -15365,22 +15318,21 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acq_rel
@@ -15588,22 +15540,21 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") seq_cst
@@ -15825,19 +15776,20 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acquire
@@ -16092,26 +16044,23 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acq_rel
@@ -16366,26 +16315,23 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") seq_cst
@@ -16625,19 +16571,20 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16906,21 +16853,22 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17188,24 +17136,23 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17502,26 +17449,25 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17818,26 +17764,25 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18106,21 +18051,22 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18389,21 +18335,22 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18700,26 +18647,25 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19016,26 +18962,25 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19332,26 +19277,25 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19648,26 +19592,25 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19964,26 +19907,25 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20280,26 +20222,25 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20596,26 +20537,25 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20912,26 +20852,25 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21215,21 +21154,22 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21542,23 +21482,24 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21872,26 +21813,25 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22236,30 +22176,27 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22604,30 +22541,27 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22944,25 +22878,24 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -23275,23 +23208,24 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -23636,30 +23570,27 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -24004,30 +23935,27 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -24372,30 +24300,27 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -24740,30 +24665,27 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -25104,28 +25026,27 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -25470,30 +25391,27 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -25838,30 +25756,27 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -26206,30 +26121,27 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
index 77f52e4d4b9fd..68af003ba6353 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
@@ -145,18 +145,16 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_nontemporal_load_0:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_nontemporal_load_0:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load volatile i32, ptr %in, align 4
@@ -422,22 +420,20 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_nontemporal_load_1:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v1, v0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_mov_b32 s4, 0x3ff
-; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_nontemporal_load_1:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff
+; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1144,16 +1140,18 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_volatile_workgroup_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_volatile_workgroup_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic volatile i32 %in, ptr %out syncscope("workgroup") release, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
index f086542b3d1f8..a4804675fd3cf 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
@@ -929,15 +929,16 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront") unordered, align 4
@@ -1088,15 +1089,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront") monotonic, align 4
@@ -1247,15 +1249,16 @@ define amdgpu_kernel void @flat_wavefront_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront") release, align 4
@@ -1406,15 +1409,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront") seq_cst, align 4
@@ -1565,15 +1569,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") monotonic
@@ -1724,15 +1729,16 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acquire
@@ -1883,15 +1889,16 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") release
@@ -2042,15 +2049,16 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acq_rel
@@ -2201,15 +2209,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") seq_cst
@@ -2404,17 +2413,18 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acquire
@@ -2610,17 +2620,18 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acq_rel
@@ -2816,17 +2827,18 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") seq_cst
@@ -3066,19 +3078,20 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3318,19 +3331,20 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3570,19 +3584,20 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3822,19 +3837,20 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4074,19 +4090,20 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4326,19 +4343,20 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4578,19 +4596,20 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4830,19 +4849,20 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5082,19 +5102,20 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5334,19 +5355,20 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5586,19 +5608,20 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5838,19 +5861,20 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6090,19 +6114,20 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6342,19 +6367,20 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6594,19 +6620,20 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6890,21 +6917,22 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7190,21 +7218,22 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7490,21 +7519,22 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7790,21 +7820,22 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8090,21 +8121,22 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8390,21 +8422,22 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8690,21 +8723,22 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8990,21 +9024,22 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9290,21 +9325,22 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9590,21 +9626,22 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9890,21 +9927,22 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10190,21 +10228,22 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10490,21 +10529,22 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10790,21 +10830,22 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11090,21 +11131,22 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12030,15 +12072,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront-one-as") unordered, align 4
@@ -12189,15 +12232,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront-one-as") monotonic, align 4
@@ -12348,15 +12392,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront-one-as") release, align 4
@@ -12507,15 +12552,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront-one-as") seq_cst, align 4
@@ -12666,15 +12712,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") monotonic
@@ -12825,15 +12872,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -12984,15 +13032,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") release
@@ -13143,15 +13192,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -13302,15 +13352,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -13505,17 +13556,18 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -13711,17 +13763,18 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -13917,17 +13970,18 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -14167,19 +14221,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14419,19 +14474,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14671,19 +14727,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14923,19 +14980,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15175,19 +15233,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15427,19 +15486,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15679,19 +15739,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15931,19 +15992,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16183,19 +16245,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16435,19 +16498,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16687,19 +16751,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16939,19 +17004,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17191,19 +17257,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17443,19 +17510,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17695,19 +17763,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17991,21 +18060,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18291,21 +18361,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18591,21 +18662,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18891,21 +18963,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19191,21 +19264,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19491,21 +19565,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19791,21 +19866,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20091,21 +20167,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20391,21 +20468,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20691,21 +20769,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20991,21 +21070,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21291,21 +21371,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21591,21 +21672,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21891,21 +21973,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
index d8e6ad043e061..01801637ce770 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
@@ -811,17 +811,18 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_seq_cst_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("workgroup") seq_cst, align 4
@@ -973,15 +974,16 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup") unordered, align 4
@@ -1132,15 +1134,16 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup") monotonic, align 4
@@ -1308,16 +1311,18 @@ define amdgpu_kernel void @flat_workgroup_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup") release, align 4
@@ -1485,16 +1490,18 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup") seq_cst, align 4
@@ -1645,15 +1652,16 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") monotonic
@@ -1823,16 +1831,17 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acquire
@@ -2000,16 +2009,18 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") release
@@ -2196,17 +2207,19 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acq_rel
@@ -2393,17 +2406,19 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") seq_cst
@@ -2610,17 +2625,18 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acquire
@@ -2847,18 +2863,20 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acq_rel
@@ -3085,18 +3103,20 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") seq_cst
@@ -3336,19 +3356,20 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3607,20 +3628,21 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3877,20 +3899,22 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4166,21 +4190,23 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4456,21 +4482,23 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4729,20 +4757,21 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5001,20 +5030,21 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5290,21 +5320,23 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5580,21 +5612,23 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5870,21 +5904,23 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6160,21 +6196,23 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6458,21 +6496,22 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6770,21 +6809,22 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7087,22 +7127,24 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7419,22 +7461,24 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7751,22 +7795,24 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8066,21 +8112,22 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8378,21 +8425,22 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8709,22 +8757,24 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9041,22 +9091,24 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9373,22 +9425,24 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9705,22 +9759,24 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10035,22 +10091,24 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10367,22 +10425,24 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10699,22 +10759,24 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11031,22 +11093,24 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11839,16 +11903,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("workgroup-one-as") seq_cst, align 4
@@ -12000,15 +12066,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup-one-as") unordered, align 4
@@ -12159,15 +12226,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup-one-as") monotonic, align 4
@@ -12328,15 +12396,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup-one-as") release, align 4
@@ -12497,15 +12568,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup-one-as") seq_cst, align 4
@@ -12656,15 +12730,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") monotonic
@@ -12825,15 +12900,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -12994,15 +13071,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") release
@@ -13173,15 +13253,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -13352,15 +13436,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -13563,17 +13651,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -13789,17 +13878,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -14015,17 +14107,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -14265,19 +14360,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14527,19 +14623,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14789,19 +14887,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15061,19 +15162,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15333,19 +15438,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15595,19 +15704,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15857,19 +15968,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16129,19 +16242,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16401,19 +16518,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16673,19 +16794,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16945,19 +17070,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17217,19 +17346,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17489,19 +17622,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17761,19 +17898,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18033,19 +18174,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18329,21 +18474,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18637,21 +18783,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18947,21 +19094,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19267,21 +19417,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19587,21 +19740,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19897,21 +20053,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20205,21 +20362,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20525,21 +20683,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20845,21 +21006,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21165,21 +21329,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21485,21 +21652,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21803,21 +21973,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22123,21 +22296,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22443,21 +22619,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22763,21 +22942,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
index 184e15406bfbc..ad163cefe57d4 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
@@ -829,23 +829,19 @@ define amdgpu_kernel void @global_agent_seq_cst_load(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_seq_cst_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent") seq_cst, align 4
@@ -1004,15 +1000,16 @@ define amdgpu_kernel void @global_agent_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") unordered, align 4
@@ -1170,15 +1167,16 @@ define amdgpu_kernel void @global_agent_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") monotonic, align 4
@@ -1361,19 +1359,19 @@ define amdgpu_kernel void @global_agent_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") release, align 4
@@ -1556,19 +1554,19 @@ define amdgpu_kernel void @global_agent_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") seq_cst, align 4
@@ -1724,15 +1722,16 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") monotonic
@@ -1917,17 +1916,18 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acquire
@@ -2108,19 +2108,19 @@ define amdgpu_kernel void @global_agent_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") release
@@ -2330,21 +2330,21 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acq_rel
@@ -2554,21 +2554,21 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
@@ -2768,18 +2768,19 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acquire
@@ -3009,24 +3010,22 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acq_rel
@@ -3256,24 +3255,22 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
@@ -3494,19 +3491,20 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3756,21 +3754,22 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4016,23 +4015,23 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4307,25 +4306,25 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4600,25 +4599,25 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4868,21 +4867,22 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5132,21 +5132,22 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5421,25 +5422,25 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5714,25 +5715,25 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6007,25 +6008,25 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6300,25 +6301,25 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6593,25 +6594,25 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6886,25 +6887,25 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7179,25 +7180,25 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7472,25 +7473,25 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7741,21 +7742,22 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8024,22 +8026,23 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8317,25 +8320,25 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8633,28 +8636,26 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8952,28 +8953,26 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9246,24 +9245,23 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9532,22 +9530,23 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9845,28 +9844,26 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10164,28 +10161,26 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10483,28 +10478,26 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10802,28 +10795,26 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -11117,26 +11108,26 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -11434,28 +11425,26 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -11753,28 +11742,26 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -12072,28 +12059,26 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -12918,23 +12903,19 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_seq_cst_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") seq_cst, align 4
@@ -13093,15 +13074,16 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") unordered, align 4
@@ -13259,15 +13241,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") monotonic, align 4
@@ -13450,19 +13433,19 @@ define amdgpu_kernel void @global_agent_one_as_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") release, align 4
@@ -13645,19 +13628,19 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") seq_cst, align 4
@@ -13813,15 +13796,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") monotonic
@@ -14006,17 +13990,18 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acquire
@@ -14197,19 +14182,19 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") release
@@ -14419,21 +14404,21 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -14643,21 +14628,21 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -14857,18 +14842,19 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acquire
@@ -15098,24 +15084,22 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -15345,24 +15329,22 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -15583,19 +15565,20 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15845,21 +15828,22 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16105,23 +16089,23 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16396,25 +16380,25 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16689,25 +16673,25 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16957,21 +16941,22 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17221,21 +17206,22 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17510,25 +17496,25 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17803,25 +17789,25 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18096,25 +18082,25 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18389,25 +18375,25 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18682,25 +18668,25 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18975,25 +18961,25 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19268,25 +19254,25 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19561,25 +19547,25 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19830,21 +19816,22 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20113,22 +20100,23 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20426,28 +20414,26 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20745,28 +20731,26 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21039,24 +21023,23 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21325,22 +21308,23 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21638,28 +21622,26 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21957,28 +21939,26 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -22276,28 +22256,26 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -22595,28 +22573,26 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -22910,26 +22886,26 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -23227,28 +23203,26 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -23546,28 +23520,26 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -23865,28 +23837,26 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
index ed2d62356f8f2..bda702156905a 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
@@ -87,18 +87,16 @@ define amdgpu_kernel void @global_last_use_and_volatile_load(ptr addrspace(1) %i
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_last_use_and_volatile_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_last_use_and_volatile_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
entry:
%val = load volatile i32, ptr addrspace(1) %in, align 4, !amdgpu.last.use !{}
store i32 %val, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
index c1bfe21865c15..4575cbbfd839e 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
@@ -1105,18 +1105,16 @@ define amdgpu_kernel void @global_nontemporal_volatile_load(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_nontemporal_volatile_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_nontemporal_volatile_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load volatile i32, ptr addrspace(1) %in, align 4, !nontemporal !0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
index 6a5a6e01c741b..4f2ea4493560f 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
@@ -945,15 +945,16 @@ define amdgpu_kernel void @global_singlethread_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") unordered, align 4
@@ -1111,15 +1112,16 @@ define amdgpu_kernel void @global_singlethread_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") monotonic, align 4
@@ -1277,15 +1279,16 @@ define amdgpu_kernel void @global_singlethread_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") release, align 4
@@ -1443,15 +1446,16 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") seq_cst, align 4
@@ -1607,15 +1611,16 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") monotonic
@@ -1771,15 +1776,16 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acquire
@@ -1935,15 +1941,16 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") release
@@ -2099,15 +2106,16 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acq_rel
@@ -2263,15 +2271,16 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") seq_cst
@@ -2455,17 +2464,18 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acquire
@@ -2650,17 +2660,18 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acq_rel
@@ -2845,17 +2856,18 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") seq_cst
@@ -3076,19 +3088,20 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3309,19 +3322,20 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3542,19 +3556,20 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3775,19 +3790,20 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4008,19 +4024,20 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4241,19 +4258,20 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4474,19 +4492,20 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4707,19 +4726,20 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4940,19 +4960,20 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5173,19 +5194,20 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5406,19 +5428,20 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5639,19 +5662,20 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5872,19 +5896,20 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6105,19 +6130,20 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6338,19 +6364,20 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6601,21 +6628,22 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6868,21 +6896,22 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7135,21 +7164,22 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7402,21 +7432,22 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7669,21 +7700,22 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7936,21 +7968,22 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8203,21 +8236,22 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8470,21 +8504,22 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8737,21 +8772,22 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9004,21 +9040,22 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9271,21 +9308,22 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9538,21 +9576,22 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9805,21 +9844,22 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10072,21 +10112,22 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10339,21 +10380,22 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -11294,15 +11336,16 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") unordered, align 4
@@ -11460,15 +11503,16 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") monotonic, align 4
@@ -11626,15 +11670,16 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") release, align 4
@@ -11792,15 +11837,16 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") seq_cst, align 4
@@ -11956,15 +12002,16 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") monotonic
@@ -12120,15 +12167,16 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -12284,15 +12332,16 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") release
@@ -12448,15 +12497,16 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -12612,15 +12662,16 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -12804,17 +12855,18 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -12999,17 +13051,18 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -13194,17 +13247,18 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -13425,19 +13479,20 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -13658,19 +13713,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -13891,19 +13947,20 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14124,19 +14181,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14357,19 +14415,20 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14590,19 +14649,20 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14823,19 +14883,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15056,19 +15117,20 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15289,19 +15351,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15522,19 +15585,20 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15755,19 +15819,20 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15988,19 +16053,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16221,19 +16287,20 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16454,19 +16521,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16687,19 +16755,20 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16950,21 +17019,22 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17217,21 +17287,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17484,21 +17555,22 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17751,21 +17823,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18018,21 +18091,22 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18285,21 +18359,22 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18552,21 +18627,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18819,21 +18895,22 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19086,21 +19163,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19353,21 +19431,22 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19620,21 +19699,22 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19887,21 +19967,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20154,21 +20235,22 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20421,21 +20503,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20688,21 +20771,22 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
index 7ddd515830e11..c8a45deccb462 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
@@ -833,23 +833,19 @@ define amdgpu_kernel void @global_system_seq_cst_load(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_seq_cst_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in seq_cst, align 4
@@ -1008,15 +1004,16 @@ define amdgpu_kernel void @global_system_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out unordered, align 4
@@ -1174,15 +1171,16 @@ define amdgpu_kernel void @global_system_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out monotonic, align 4
@@ -1369,20 +1367,19 @@ define amdgpu_kernel void @global_system_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out release, align 4
@@ -1569,20 +1566,19 @@ define amdgpu_kernel void @global_system_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out seq_cst, align 4
@@ -1738,15 +1734,16 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in monotonic
@@ -1933,17 +1930,18 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acquire
@@ -2128,20 +2126,19 @@ define amdgpu_kernel void @global_system_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in release
@@ -2357,22 +2354,21 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acq_rel
@@ -2588,22 +2584,21 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in seq_cst
@@ -2805,18 +2800,19 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acquire
@@ -3052,25 +3048,22 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acq_rel
@@ -3306,25 +3299,22 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in seq_cst
@@ -3545,19 +3535,20 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3809,21 +3800,22 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4073,24 +4065,23 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4371,26 +4362,25 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4671,26 +4661,25 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4942,21 +4931,22 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5208,21 +5198,22 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5503,26 +5494,25 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5803,26 +5793,25 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6103,26 +6092,25 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6403,26 +6391,25 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6673,21 +6660,22 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6958,22 +6946,23 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7277,29 +7266,26 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7603,29 +7589,26 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7900,24 +7883,23 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8188,22 +8170,23 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8507,29 +8490,26 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8833,29 +8813,26 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9159,29 +9136,26 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9485,29 +9459,26 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9807,27 +9778,26 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10131,29 +10101,26 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10457,29 +10424,26 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10783,29 +10747,26 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -11634,23 +11595,19 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_seq_cst_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") seq_cst, align 4
@@ -11809,15 +11766,16 @@ define amdgpu_kernel void @global_system_one_as_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") unordered, align 4
@@ -11975,15 +11933,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") monotonic, align 4
@@ -12170,20 +12129,19 @@ define amdgpu_kernel void @global_system_one_as_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") release, align 4
@@ -12370,20 +12328,19 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") seq_cst, align 4
@@ -12539,15 +12496,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") monotonic
@@ -12734,17 +12692,18 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acquire
@@ -12929,20 +12888,19 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") release
@@ -13158,22 +13116,21 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acq_rel
@@ -13389,22 +13346,21 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") seq_cst
@@ -13606,18 +13562,19 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acquire
@@ -13853,25 +13810,22 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acq_rel
@@ -14107,25 +14061,22 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") seq_cst
@@ -14346,19 +14297,20 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14610,21 +14562,22 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14874,24 +14827,23 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15172,26 +15124,25 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15472,26 +15423,25 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15743,21 +15693,22 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16009,21 +15960,22 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16304,26 +16256,25 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16604,26 +16555,25 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16904,26 +16854,25 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17204,26 +17153,25 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17504,26 +17452,25 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17804,26 +17751,25 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18104,26 +18050,25 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18404,26 +18349,25 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18674,21 +18618,22 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18959,22 +18904,23 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19256,26 +19202,25 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19579,29 +19524,26 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19905,29 +19847,26 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20202,24 +20141,23 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20490,22 +20428,23 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20809,29 +20748,26 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21135,29 +21071,26 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21461,29 +21394,26 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21787,29 +21717,26 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -22109,27 +22036,26 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -22433,29 +22359,26 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -22759,29 +22682,26 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -23085,29 +23005,26 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_storecnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
index 1539fb574c0bd..f4fdec7490117 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
@@ -148,18 +148,16 @@ define amdgpu_kernel void @global_volatile_load_0(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_volatile_load_0:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_volatile_load_0:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load volatile i32, ptr addrspace(1) %in, align 4
@@ -357,22 +355,20 @@ define amdgpu_kernel void @global_volatile_load_1(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_volatile_load_1:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v1, v0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_mov_b32 s4, 0x3ff
-; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_volatile_load_1:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff
+; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1034,16 +1030,18 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_volatile_workgroup_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_volatile_workgroup_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic volatile i32 %in, ptr addrspace(1) %out syncscope("workgroup") release, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
index 1aa8305b1a837..f66e6d00e6eab 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
@@ -945,15 +945,16 @@ define amdgpu_kernel void @global_wavefront_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") unordered, align 4
@@ -1111,15 +1112,16 @@ define amdgpu_kernel void @global_wavefront_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") monotonic, align 4
@@ -1277,15 +1279,16 @@ define amdgpu_kernel void @global_wavefront_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") release, align 4
@@ -1443,15 +1446,16 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") seq_cst, align 4
@@ -1607,15 +1611,16 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") monotonic
@@ -1771,15 +1776,16 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acquire
@@ -1935,15 +1941,16 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") release
@@ -2099,15 +2106,16 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acq_rel
@@ -2263,15 +2271,16 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") seq_cst
@@ -2455,17 +2464,18 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acquire
@@ -2650,17 +2660,18 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acq_rel
@@ -2845,17 +2856,18 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") seq_cst
@@ -3076,19 +3088,20 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3309,19 +3322,20 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3542,19 +3556,20 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3775,19 +3790,20 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4008,19 +4024,20 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4241,19 +4258,20 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4474,19 +4492,20 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4707,19 +4726,20 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4940,19 +4960,20 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5173,19 +5194,20 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5406,19 +5428,20 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5639,19 +5662,20 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5872,19 +5896,20 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6105,19 +6130,20 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6338,19 +6364,20 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6601,21 +6628,22 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6868,21 +6896,22 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7135,21 +7164,22 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7402,21 +7432,22 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7669,21 +7700,22 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7936,21 +7968,22 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8203,21 +8236,22 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8470,21 +8504,22 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8737,21 +8772,22 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9004,21 +9040,22 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9271,21 +9308,22 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9538,21 +9576,22 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9805,21 +9844,22 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10072,21 +10112,22 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10339,21 +10380,22 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -11294,15 +11336,16 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") unordered, align 4
@@ -11460,15 +11503,16 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") monotonic, align 4
@@ -11626,15 +11670,16 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") release, align 4
@@ -11792,15 +11837,16 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") seq_cst, align 4
@@ -11956,15 +12002,16 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") monotonic
@@ -12120,15 +12167,16 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -12284,15 +12332,16 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") release
@@ -12448,15 +12497,16 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -12612,15 +12662,16 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -12804,17 +12855,18 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -12999,17 +13051,18 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -13194,17 +13247,18 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -13425,19 +13479,20 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -13658,19 +13713,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -13891,19 +13947,20 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14124,19 +14181,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14357,19 +14415,20 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14590,19 +14649,20 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14823,19 +14883,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15056,19 +15117,20 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15289,19 +15351,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15522,19 +15585,20 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15755,19 +15819,20 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15988,19 +16053,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16221,19 +16287,20 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16454,19 +16521,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16687,19 +16755,20 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16950,21 +17019,22 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17217,21 +17287,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17484,21 +17555,22 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17751,21 +17823,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18018,21 +18091,22 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18285,21 +18359,22 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18552,21 +18627,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18819,21 +18895,22 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19086,21 +19163,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19353,21 +19431,22 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19620,21 +19699,22 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19887,21 +19967,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20154,21 +20235,22 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20421,21 +20503,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20688,21 +20771,22 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
index 3eab16e6b9713..bbbf8cf7f5cb1 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
@@ -799,17 +799,18 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_seq_cst_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") seq_cst, align 4
@@ -968,15 +969,16 @@ define amdgpu_kernel void @global_workgroup_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") unordered, align 4
@@ -1134,15 +1136,16 @@ define amdgpu_kernel void @global_workgroup_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") monotonic, align 4
@@ -1318,16 +1321,18 @@ define amdgpu_kernel void @global_workgroup_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") release, align 4
@@ -1503,16 +1508,18 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") seq_cst, align 4
@@ -1668,15 +1675,16 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") monotonic
@@ -1842,15 +1850,17 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acquire
@@ -2024,16 +2034,18 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") release
@@ -2217,16 +2229,19 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acq_rel
@@ -2410,16 +2425,19 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
@@ -2608,17 +2626,18 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acquire
@@ -2828,18 +2847,20 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acq_rel
@@ -3049,18 +3070,20 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
@@ -3281,19 +3304,20 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3524,19 +3548,21 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3775,20 +3801,22 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4037,20 +4065,23 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4299,20 +4330,23 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4543,19 +4577,21 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4786,19 +4822,21 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5047,20 +5085,23 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5309,20 +5350,23 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5571,20 +5615,23 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5833,20 +5880,23 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6095,20 +6145,23 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6357,20 +6410,23 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6619,20 +6675,23 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6881,20 +6940,23 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7145,21 +7207,22 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7417,21 +7480,22 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7702,22 +7766,24 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7995,22 +8061,24 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8288,22 +8356,24 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8563,21 +8633,22 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8835,21 +8906,22 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9127,22 +9199,24 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9420,22 +9494,24 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9713,22 +9789,24 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10006,22 +10084,24 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10297,22 +10377,24 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10590,22 +10672,24 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10883,22 +10967,24 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -11176,22 +11262,24 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -11982,16 +12070,18 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_seq_cst_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") seq_cst, align 4
@@ -12150,15 +12240,16 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_unordered_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_unordered_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") unordered, align 4
@@ -12316,15 +12407,16 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_monotonic_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") monotonic, align 4
@@ -12492,15 +12584,18 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") release, align 4
@@ -12668,15 +12763,18 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") seq_cst, align 4
@@ -12832,15 +12930,16 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") monotonic
@@ -13006,15 +13105,17 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_acquire_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -13180,15 +13281,18 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") release
@@ -13364,15 +13468,19 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -13548,15 +13656,19 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -13745,17 +13857,18 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -13957,17 +14070,20 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -14169,17 +14285,20 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -14400,19 +14519,20 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14643,19 +14763,21 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14886,19 +15008,22 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15139,19 +15264,23 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15392,19 +15521,23 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15635,19 +15768,21 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15878,19 +16013,21 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16131,19 +16268,23 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16384,19 +16525,23 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16637,19 +16782,23 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16890,19 +17039,23 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17143,19 +17296,23 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17396,19 +17553,23 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17649,19 +17810,23 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17902,19 +18067,23 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18165,21 +18334,22 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18437,21 +18607,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18714,21 +18885,24 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18998,21 +19172,24 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19282,21 +19459,24 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19556,21 +19736,22 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19828,21 +20009,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20112,21 +20294,24 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20396,21 +20581,24 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20680,21 +20868,24 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20964,21 +21155,24 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21246,21 +21440,24 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21530,21 +21727,24 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21814,21 +22014,24 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -22098,21 +22301,24 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
index 102616b9a2065..7428ddc780675 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
@@ -756,18 +756,19 @@ define amdgpu_kernel void @local_agent_seq_cst_load(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_seq_cst_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_load_b32 v1, v0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_load_b32 v1, v0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("agent") seq_cst, align 4
@@ -1228,16 +1229,17 @@ define amdgpu_kernel void @local_agent_release_store(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") release, align 4
@@ -1397,16 +1399,17 @@ define amdgpu_kernel void @local_agent_seq_cst_store(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") seq_cst, align 4
@@ -1883,16 +1886,17 @@ define amdgpu_kernel void @local_agent_release_atomicrmw(
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") release
@@ -2068,17 +2072,18 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") acq_rel
@@ -2254,17 +2259,18 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") seq_cst
@@ -2672,19 +2678,20 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") acq_rel
@@ -2892,19 +2899,20 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") seq_cst
@@ -3466,18 +3474,19 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3680,19 +3689,20 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3895,19 +3905,20 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4502,19 +4513,20 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4717,19 +4729,20 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4932,19 +4945,20 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5147,19 +5161,20 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5362,19 +5377,20 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5577,19 +5593,20 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5792,19 +5809,20 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6007,19 +6025,20 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6705,21 +6724,22 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6955,21 +6975,22 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7205,21 +7226,22 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7917,21 +7939,22 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8167,21 +8190,22 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8417,21 +8441,22 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8667,21 +8692,22 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8917,21 +8943,22 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9167,21 +9194,22 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9417,21 +9445,22 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9667,21 +9696,22 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
index 1356fe4854170..d57736ba0230c 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
@@ -756,18 +756,19 @@ define amdgpu_kernel void @local_system_seq_cst_load(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_seq_cst_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_load_b32 v1, v0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_load_b32 v1, v0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in seq_cst, align 4
@@ -1228,16 +1229,17 @@ define amdgpu_kernel void @local_system_release_store(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out release, align 4
@@ -1397,16 +1399,17 @@ define amdgpu_kernel void @local_system_seq_cst_store(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out seq_cst, align 4
@@ -1883,16 +1886,17 @@ define amdgpu_kernel void @local_system_release_atomicrmw(
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in release
@@ -2068,17 +2072,18 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in acq_rel
@@ -2254,17 +2259,18 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in seq_cst
@@ -2672,19 +2678,20 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in acq_rel
@@ -2892,19 +2899,20 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in seq_cst
@@ -3466,18 +3474,19 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3680,19 +3689,20 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3895,19 +3905,20 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4502,19 +4513,20 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4717,19 +4729,20 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4932,19 +4945,20 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5147,19 +5161,20 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5362,19 +5377,20 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5577,19 +5593,20 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5792,19 +5809,20 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6007,19 +6025,20 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6705,21 +6724,22 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6955,21 +6975,22 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7205,21 +7226,22 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7917,21 +7939,22 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8167,21 +8190,22 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8417,21 +8441,22 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8667,21 +8692,22 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8917,21 +8943,22 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9167,21 +9194,22 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9417,21 +9445,22 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9667,21 +9696,22 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
index 75e28f9008e28..d8ba02adf4b35 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
@@ -883,16 +883,17 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_volatile_workgroup_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_volatile_workgroup_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic volatile i32 %in, ptr addrspace(3) %out syncscope("workgroup") release, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
index 6aaf9d323b1fd..7220c071bf657 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
@@ -756,18 +756,19 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_seq_cst_load:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_load_b32 v1, v0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_load:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_load_b32 v1, v0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup") seq_cst, align 4
@@ -1228,16 +1229,17 @@ define amdgpu_kernel void @local_workgroup_release_store(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_release_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_release_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") release, align 4
@@ -1397,16 +1399,17 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_seq_cst_store:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_store:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") seq_cst, align 4
@@ -1883,16 +1886,17 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw(
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_release_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_release_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") release
@@ -2068,17 +2072,18 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_acq_rel_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acq_rel
@@ -2254,17 +2259,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_seq_cst_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") seq_cst
@@ -2672,19 +2678,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acq_rel
@@ -2892,19 +2899,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") seq_cst
@@ -3466,18 +3474,19 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_release_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_release_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3680,19 +3689,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3895,19 +3905,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4502,19 +4513,20 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_release_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_release_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4717,19 +4729,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4932,19 +4945,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5147,19 +5161,20 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5362,19 +5377,20 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5577,19 +5593,20 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_release_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_release_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5792,19 +5809,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6007,19 +6025,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6705,21 +6724,22 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6955,21 +6975,22 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7205,21 +7226,22 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7917,21 +7939,22 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8167,21 +8190,22 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8417,21 +8441,22 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8667,21 +8692,22 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8917,21 +8943,22 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9167,21 +9194,22 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9417,21 +9445,22 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9667,21 +9696,22 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-NEXT: ds_store_b32 v0, v1
-; GFX1250-NEXT: s_endpgm
+; GFX1250-CU-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250-CU: ; %bb.0: ; %entry
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0
+; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-CU-NEXT: ds_store_b32 v0, v1
+; GFX1250-CU-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
>From df1db640dbfeb785251e02a254663c0046c01149 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Thu, 21 Aug 2025 12:35:47 +0200
Subject: [PATCH 2/5] clang-format
---
llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 95fa03fc97e69..0451b27bc81c5 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -2412,7 +2412,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
- case SIAtomicScope::WORKGROUP:
+ case SIAtomicScope::WORKGROUP:
// If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
// not needed as LDS operations for all waves are executed in a total
// global ordering as observed by all waves. Required if also
>From 52f20d7b43b03e8e5e31a7819ae9fd78e3f15192 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Thu, 21 Aug 2025 12:44:08 +0200
Subject: [PATCH 3/5] Drop -CU suffix
---
.../memory-legalizer-fence-mmra-global.ll | 240 +-
.../CodeGen/AMDGPU/memory-legalizer-fence.ll | 240 +-
.../AMDGPU/memory-legalizer-flat-agent.ll | 2930 ++++++++---------
.../AMDGPU/memory-legalizer-flat-lastuse.ll | 20 +-
.../memory-legalizer-flat-nontemporal.ll | 20 +-
.../memory-legalizer-flat-singlethread.ll | 2304 ++++++-------
.../AMDGPU/memory-legalizer-flat-system.ll | 2930 ++++++++---------
.../AMDGPU/memory-legalizer-flat-volatile.ll | 72 +-
.../AMDGPU/memory-legalizer-flat-wavefront.ll | 2272 ++++++-------
.../AMDGPU/memory-legalizer-flat-workgroup.ll | 2512 +++++++-------
.../AMDGPU/memory-legalizer-global-agent.ll | 2868 ++++++++--------
.../AMDGPU/memory-legalizer-global-lastuse.ll | 20 +-
.../memory-legalizer-global-nontemporal.ll | 20 +-
.../memory-legalizer-global-singlethread.ll | 2304 ++++++-------
.../AMDGPU/memory-legalizer-global-system.ll | 2706 +++++++--------
.../memory-legalizer-global-volatile.ll | 72 +-
.../memory-legalizer-global-wavefront.ll | 2304 ++++++-------
.../memory-legalizer-global-workgroup.ll | 2648 +++++++--------
.../AMDGPU/memory-legalizer-local-agent.ll | 854 ++---
.../AMDGPU/memory-legalizer-local-system.ll | 854 ++---
.../AMDGPU/memory-legalizer-local-volatile.ll | 22 +-
.../memory-legalizer-local-workgroup.ll | 854 ++---
22 files changed, 14533 insertions(+), 14533 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
index 209775314a505..6a76f4307dcad 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
@@ -80,11 +80,11 @@ define amdgpu_kernel void @workgroup_acquire_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: workgroup_acquire_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: workgroup_acquire_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -153,11 +153,11 @@ define amdgpu_kernel void @workgroup_release_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: workgroup_release_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: workgroup_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -231,11 +231,11 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: workgroup_acq_rel_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: workgroup_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -309,11 +309,11 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: workgroup_seq_cst_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: workgroup_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -385,11 +385,11 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: workgroup_one_as_acquire_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: workgroup_one_as_acquire_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -458,11 +458,11 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: workgroup_one_as_release_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: workgroup_one_as_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -536,11 +536,11 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: workgroup_one_as_acq_rel_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -614,11 +614,11 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: workgroup_one_as_seq_cst_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -801,12 +801,12 @@ define amdgpu_kernel void @agent_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: agent_release_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: agent_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -906,13 +906,13 @@ define amdgpu_kernel void @agent_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: agent_acq_rel_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: agent_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1012,13 +1012,13 @@ define amdgpu_kernel void @agent_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: agent_seq_cst_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: agent_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1201,12 +1201,12 @@ define amdgpu_kernel void @agent_one_as_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: agent_one_as_release_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: agent_one_as_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1306,13 +1306,13 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: agent_one_as_acq_rel_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: agent_one_as_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1412,13 +1412,13 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: agent_one_as_seq_cst_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: agent_one_as_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1607,12 +1607,12 @@ define amdgpu_kernel void @system_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: system_release_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: system_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1718,13 +1718,13 @@ define amdgpu_kernel void @system_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: system_acq_rel_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: system_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
entry:
fence acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -1830,13 +1830,13 @@ define amdgpu_kernel void @system_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: system_seq_cst_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: system_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
entry:
fence seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -2025,12 +2025,12 @@ define amdgpu_kernel void @system_one_as_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: system_one_as_release_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: system_one_as_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -2136,13 +2136,13 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: system_one_as_acq_rel_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: system_one_as_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
@@ -2248,13 +2248,13 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: system_one_as_seq_cst_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: system_one_as_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"}
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
index 07db15ee8e60e..736a8b58466dd 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
@@ -1064,11 +1064,11 @@ define amdgpu_kernel void @workgroup_acquire_fence() {
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: workgroup_acquire_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: workgroup_acquire_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acquire
ret void
@@ -1145,11 +1145,11 @@ define amdgpu_kernel void @workgroup_release_fence() {
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: workgroup_release_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: workgroup_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup") release
ret void
@@ -1231,11 +1231,11 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: workgroup_acq_rel_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: workgroup_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acq_rel
ret void
@@ -1317,11 +1317,11 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: workgroup_seq_cst_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: workgroup_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup") seq_cst
ret void
@@ -1393,11 +1393,11 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: workgroup_one_as_acquire_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: workgroup_one_as_acquire_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") acquire
ret void
@@ -1466,11 +1466,11 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: workgroup_one_as_release_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: workgroup_one_as_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") release
ret void
@@ -1544,11 +1544,11 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: workgroup_one_as_acq_rel_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") acq_rel
ret void
@@ -1622,11 +1622,11 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: workgroup_one_as_seq_cst_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") seq_cst
ret void
@@ -1809,12 +1809,12 @@ define amdgpu_kernel void @agent_release_fence() {
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: agent_release_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: agent_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent") release
ret void
@@ -1914,13 +1914,13 @@ define amdgpu_kernel void @agent_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: agent_acq_rel_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: agent_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent") acq_rel
ret void
@@ -2020,13 +2020,13 @@ define amdgpu_kernel void @agent_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: agent_seq_cst_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: agent_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent") seq_cst
ret void
@@ -2209,12 +2209,12 @@ define amdgpu_kernel void @agent_one_as_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: agent_one_as_release_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: agent_one_as_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") release
ret void
@@ -2314,13 +2314,13 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: agent_one_as_acq_rel_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: agent_one_as_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") acq_rel
ret void
@@ -2420,13 +2420,13 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: agent_one_as_seq_cst_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: agent_one_as_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") seq_cst
ret void
@@ -2615,12 +2615,12 @@ define amdgpu_kernel void @system_release_fence() {
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: system_release_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: system_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence release
ret void
@@ -2726,13 +2726,13 @@ define amdgpu_kernel void @system_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: system_acq_rel_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: system_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
entry:
fence acq_rel
ret void
@@ -2838,13 +2838,13 @@ define amdgpu_kernel void @system_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: system_seq_cst_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: system_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
entry:
fence seq_cst
ret void
@@ -3033,12 +3033,12 @@ define amdgpu_kernel void @system_one_as_release_fence() {
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: system_one_as_release_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: system_one_as_release_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("one-as") release
ret void
@@ -3144,13 +3144,13 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: system_one_as_acq_rel_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: system_one_as_acq_rel_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("one-as") acq_rel
ret void
@@ -3256,13 +3256,13 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() {
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: system_one_as_seq_cst_fence:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: system_one_as_seq_cst_fence:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
entry:
fence syncscope("one-as") seq_cst
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
index fe7fd8522bd6a..55ec0c2255f9b 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
@@ -825,19 +825,19 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_seq_cst_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("agent") seq_cst, align 4
@@ -989,16 +989,16 @@ define amdgpu_kernel void @flat_agent_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent") unordered, align 4
@@ -1149,16 +1149,16 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent") monotonic, align 4
@@ -1333,19 +1333,19 @@ define amdgpu_kernel void @flat_agent_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent") release, align 4
@@ -1520,19 +1520,19 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent") seq_cst, align 4
@@ -1683,16 +1683,16 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") monotonic
@@ -1874,18 +1874,18 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acquire
@@ -2060,19 +2060,19 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") release
@@ -2278,21 +2278,21 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acq_rel
@@ -2498,21 +2498,21 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst
@@ -2722,19 +2722,19 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acquire
@@ -2973,22 +2973,22 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acq_rel
@@ -3227,22 +3227,22 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst
@@ -3482,20 +3482,20 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3766,22 +3766,22 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4045,23 +4045,23 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4356,25 +4356,25 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4669,25 +4669,25 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4958,22 +4958,22 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5244,22 +5244,22 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5554,25 +5554,25 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5867,25 +5867,25 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6180,25 +6180,25 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6493,25 +6493,25 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6806,25 +6806,25 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7119,25 +7119,25 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7432,25 +7432,25 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7745,25 +7745,25 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8047,22 +8047,22 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8363,23 +8363,23 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8689,25 +8689,25 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9036,26 +9036,26 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9384,26 +9384,26 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9708,23 +9708,23 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10025,23 +10025,23 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10370,26 +10370,26 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10718,26 +10718,26 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11066,26 +11066,26 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11414,26 +11414,26 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11758,26 +11758,26 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12106,26 +12106,26 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12454,26 +12454,26 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12802,26 +12802,26 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -13664,20 +13664,20 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("agent-one-as") seq_cst, align 4
@@ -13829,16 +13829,16 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent-one-as") unordered, align 4
@@ -13989,16 +13989,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent-one-as") monotonic, align 4
@@ -14173,19 +14173,19 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent-one-as") release, align 4
@@ -14360,19 +14360,19 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("agent-one-as") seq_cst, align 4
@@ -14523,16 +14523,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") monotonic
@@ -14710,18 +14710,18 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acquire
@@ -14896,19 +14896,19 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") release
@@ -15110,21 +15110,21 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -15326,21 +15326,21 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -15560,20 +15560,20 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acquire
@@ -15822,23 +15822,23 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -16087,23 +16087,23 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -16343,20 +16343,20 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16623,22 +16623,22 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16902,23 +16902,23 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17209,25 +17209,25 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17518,25 +17518,25 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17803,22 +17803,22 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18085,22 +18085,22 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18391,25 +18391,25 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18700,25 +18700,25 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19009,25 +19009,25 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19318,25 +19318,25 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19627,25 +19627,25 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19936,25 +19936,25 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20245,25 +20245,25 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20554,25 +20554,25 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20856,22 +20856,22 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21182,24 +21182,24 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21509,25 +21509,25 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21866,27 +21866,27 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22225,27 +22225,27 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22560,24 +22560,24 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22888,24 +22888,24 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -23244,27 +23244,27 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -23603,27 +23603,27 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -23962,27 +23962,27 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -24321,27 +24321,27 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -24676,27 +24676,27 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -25035,27 +25035,27 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -25394,27 +25394,27 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -25753,27 +25753,27 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
index 22c1b6f9fe875..faa970e049bd2 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
@@ -107,16 +107,16 @@ define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) {
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_last_use_and_volatile_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_last_use_and_volatile_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
entry:
%val = load volatile i32, ptr %in, align 4, !amdgpu.last.use !{}
store i32 %val, ptr %out
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
index c949790b97d72..721ecd8da5387 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
@@ -1322,16 +1322,16 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_nontemporal_volatile_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_nontemporal_volatile_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load volatile i32, ptr %in, align 4, !nontemporal !0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
index 8a75db2c36dc7..635895259ee32 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
@@ -929,16 +929,16 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread") unordered, align 4
@@ -1089,16 +1089,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread") monotonic, align 4
@@ -1249,16 +1249,16 @@ define amdgpu_kernel void @flat_singlethread_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread") release, align 4
@@ -1409,16 +1409,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread") seq_cst, align 4
@@ -1569,16 +1569,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") monotonic
@@ -1729,16 +1729,16 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acquire
@@ -1889,16 +1889,16 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") release
@@ -2049,16 +2049,16 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acq_rel
@@ -2209,16 +2209,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") seq_cst
@@ -2413,18 +2413,18 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acquire
@@ -2620,18 +2620,18 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acq_rel
@@ -2827,18 +2827,18 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") seq_cst
@@ -3078,20 +3078,20 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3331,20 +3331,20 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3584,20 +3584,20 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3837,20 +3837,20 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4090,20 +4090,20 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4343,20 +4343,20 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4596,20 +4596,20 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4849,20 +4849,20 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5102,20 +5102,20 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5355,20 +5355,20 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5608,20 +5608,20 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5861,20 +5861,20 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6114,20 +6114,20 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6367,20 +6367,20 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6620,20 +6620,20 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6917,22 +6917,22 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7218,22 +7218,22 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7519,22 +7519,22 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7820,22 +7820,22 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8121,22 +8121,22 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8422,22 +8422,22 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8723,22 +8723,22 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9024,22 +9024,22 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9325,22 +9325,22 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9626,22 +9626,22 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9927,22 +9927,22 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10228,22 +10228,22 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10529,22 +10529,22 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10830,22 +10830,22 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11131,22 +11131,22 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12072,16 +12072,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") unordered, align 4
@@ -12232,16 +12232,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") monotonic, align 4
@@ -12392,16 +12392,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") release, align 4
@@ -12552,16 +12552,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") seq_cst, align 4
@@ -12712,16 +12712,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") monotonic
@@ -12872,16 +12872,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -13032,16 +13032,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") release
@@ -13192,16 +13192,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -13352,16 +13352,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -13556,18 +13556,18 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -13763,18 +13763,18 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -13970,18 +13970,18 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -14221,20 +14221,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14474,20 +14474,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14727,20 +14727,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14980,20 +14980,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15233,20 +15233,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15486,20 +15486,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15739,20 +15739,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15992,20 +15992,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16245,20 +16245,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16498,20 +16498,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16751,20 +16751,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17004,20 +17004,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17257,20 +17257,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17510,20 +17510,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17763,20 +17763,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18060,22 +18060,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18361,22 +18361,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18662,22 +18662,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18963,22 +18963,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19264,22 +19264,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19565,22 +19565,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19866,22 +19866,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20167,22 +20167,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20468,22 +20468,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20769,22 +20769,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21070,22 +21070,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21371,22 +21371,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21672,22 +21672,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21973,22 +21973,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22274,22 +22274,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
index b5ea23d4655b6..e45a8e51c836c 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
@@ -829,19 +829,19 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_seq_cst_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in seq_cst, align 4
@@ -993,16 +993,16 @@ define amdgpu_kernel void @flat_system_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out unordered, align 4
@@ -1153,16 +1153,16 @@ define amdgpu_kernel void @flat_system_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out monotonic, align 4
@@ -1341,19 +1341,19 @@ define amdgpu_kernel void @flat_system_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out release, align 4
@@ -1532,19 +1532,19 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out seq_cst, align 4
@@ -1695,16 +1695,16 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in monotonic
@@ -1888,18 +1888,18 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in acquire
@@ -2078,19 +2078,19 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in release
@@ -2302,21 +2302,21 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in acq_rel
@@ -2528,21 +2528,21 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in seq_cst
@@ -2754,19 +2754,19 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in acquire
@@ -3011,22 +3011,22 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in acq_rel
@@ -3271,22 +3271,22 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in seq_cst
@@ -3526,20 +3526,20 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3812,22 +3812,22 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4095,23 +4095,23 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4412,25 +4412,25 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4731,25 +4731,25 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5022,22 +5022,22 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5310,22 +5310,22 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5626,25 +5626,25 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5945,25 +5945,25 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6264,25 +6264,25 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6583,25 +6583,25 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6902,25 +6902,25 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7221,25 +7221,25 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7540,25 +7540,25 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7859,25 +7859,25 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8161,22 +8161,22 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8479,23 +8479,23 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8809,25 +8809,25 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9162,26 +9162,26 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9516,26 +9516,26 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9842,23 +9842,23 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10161,23 +10161,23 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10512,26 +10512,26 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10866,26 +10866,26 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11220,26 +11220,26 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11574,26 +11574,26 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11924,26 +11924,26 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12278,26 +12278,26 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12632,26 +12632,26 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12986,26 +12986,26 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -13852,20 +13852,20 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("one-as") seq_cst, align 4
@@ -14017,16 +14017,16 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("one-as") unordered, align 4
@@ -14177,16 +14177,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("one-as") monotonic, align 4
@@ -14365,19 +14365,19 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("one-as") release, align 4
@@ -14556,19 +14556,19 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("one-as") seq_cst, align 4
@@ -14719,16 +14719,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") monotonic
@@ -14908,18 +14908,18 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acquire
@@ -15098,19 +15098,19 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") release
@@ -15318,21 +15318,21 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acq_rel
@@ -15540,21 +15540,21 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") seq_cst
@@ -15776,20 +15776,20 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acquire
@@ -16044,23 +16044,23 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acq_rel
@@ -16315,23 +16315,23 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") seq_cst
@@ -16571,20 +16571,20 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16853,22 +16853,22 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17136,23 +17136,23 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17449,25 +17449,25 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17764,25 +17764,25 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18051,22 +18051,22 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18335,22 +18335,22 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18647,25 +18647,25 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18962,25 +18962,25 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19277,25 +19277,25 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19592,25 +19592,25 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19907,25 +19907,25 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20222,25 +20222,25 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20537,25 +20537,25 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20852,25 +20852,25 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21154,22 +21154,22 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21482,24 +21482,24 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21813,25 +21813,25 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22176,27 +22176,27 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22541,27 +22541,27 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22878,24 +22878,24 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -23208,24 +23208,24 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -23570,27 +23570,27 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -23935,27 +23935,27 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -24300,27 +24300,27 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -24665,27 +24665,27 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -25026,27 +25026,27 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -25391,27 +25391,27 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -25756,27 +25756,27 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -26121,27 +26121,27 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
index 68af003ba6353..41c5927cad4de 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
@@ -145,16 +145,16 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_nontemporal_load_0:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_nontemporal_load_0:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load volatile i32, ptr %in, align 4
@@ -420,20 +420,20 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_nontemporal_load_1:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff
-; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_nontemporal_load_1:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v1, v0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s4, 0x3ff
+; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1140,18 +1140,18 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_volatile_workgroup_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_volatile_workgroup_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic volatile i32 %in, ptr %out syncscope("workgroup") release, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
index a4804675fd3cf..041b3f51abc2f 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
@@ -929,16 +929,16 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront") unordered, align 4
@@ -1089,16 +1089,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront") monotonic, align 4
@@ -1249,16 +1249,16 @@ define amdgpu_kernel void @flat_wavefront_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront") release, align 4
@@ -1409,16 +1409,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront") seq_cst, align 4
@@ -1569,16 +1569,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") monotonic
@@ -1729,16 +1729,16 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acquire
@@ -1889,16 +1889,16 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") release
@@ -2049,16 +2049,16 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acq_rel
@@ -2209,16 +2209,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") seq_cst
@@ -2413,18 +2413,18 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acquire
@@ -2620,18 +2620,18 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acq_rel
@@ -2827,18 +2827,18 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") seq_cst
@@ -3078,20 +3078,20 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3331,20 +3331,20 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3584,20 +3584,20 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3837,20 +3837,20 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4090,20 +4090,20 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4343,20 +4343,20 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4596,20 +4596,20 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4849,20 +4849,20 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5102,20 +5102,20 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5355,20 +5355,20 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5608,20 +5608,20 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5861,20 +5861,20 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6114,20 +6114,20 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6367,20 +6367,20 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6620,20 +6620,20 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6917,22 +6917,22 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7218,22 +7218,22 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7519,22 +7519,22 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7820,22 +7820,22 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8121,22 +8121,22 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8422,22 +8422,22 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8723,22 +8723,22 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9024,22 +9024,22 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9325,22 +9325,22 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9626,22 +9626,22 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9927,22 +9927,22 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10228,22 +10228,22 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10529,22 +10529,22 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10830,22 +10830,22 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11131,22 +11131,22 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -12072,16 +12072,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront-one-as") unordered, align 4
@@ -12232,16 +12232,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront-one-as") monotonic, align 4
@@ -12392,16 +12392,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront-one-as") release, align 4
@@ -12552,16 +12552,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("wavefront-one-as") seq_cst, align 4
@@ -12712,16 +12712,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") monotonic
@@ -12872,16 +12872,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -13032,16 +13032,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") release
@@ -13192,16 +13192,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -13352,16 +13352,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -13556,18 +13556,18 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -13763,18 +13763,18 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -13970,18 +13970,18 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -14221,20 +14221,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14474,20 +14474,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14727,20 +14727,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14980,20 +14980,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15233,20 +15233,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15486,20 +15486,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15739,20 +15739,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15992,20 +15992,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16245,20 +16245,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16498,20 +16498,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16751,20 +16751,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17004,20 +17004,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17257,20 +17257,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17510,20 +17510,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17763,20 +17763,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18060,22 +18060,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18361,22 +18361,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18662,22 +18662,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18963,22 +18963,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19264,22 +19264,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19565,22 +19565,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19866,22 +19866,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20167,22 +20167,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20468,22 +20468,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20769,22 +20769,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21070,22 +21070,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21371,22 +21371,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21672,22 +21672,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21973,22 +21973,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
index 01801637ce770..85ecab8128d2f 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
@@ -811,18 +811,18 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_seq_cst_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3]
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("workgroup") seq_cst, align 4
@@ -974,16 +974,16 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup") unordered, align 4
@@ -1134,16 +1134,16 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup") monotonic, align 4
@@ -1311,18 +1311,18 @@ define amdgpu_kernel void @flat_workgroup_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup") release, align 4
@@ -1490,18 +1490,18 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup") seq_cst, align 4
@@ -1652,16 +1652,16 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") monotonic
@@ -1831,17 +1831,17 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acquire
@@ -2009,18 +2009,18 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") release
@@ -2207,19 +2207,19 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acq_rel
@@ -2406,19 +2406,19 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") seq_cst
@@ -2625,18 +2625,18 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acquire
@@ -2863,20 +2863,20 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acq_rel
@@ -3103,20 +3103,20 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") seq_cst
@@ -3356,20 +3356,20 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3628,21 +3628,21 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -3899,22 +3899,22 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4190,23 +4190,23 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4482,23 +4482,23 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -4757,21 +4757,21 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5030,21 +5030,21 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5320,23 +5320,23 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5612,23 +5612,23 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -5904,23 +5904,23 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6196,23 +6196,23 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6496,22 +6496,22 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -6809,22 +6809,22 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7127,24 +7127,24 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7461,24 +7461,24 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -7795,24 +7795,24 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8112,22 +8112,22 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8425,22 +8425,22 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -8757,24 +8757,24 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9091,24 +9091,24 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9425,24 +9425,24 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -9759,24 +9759,24 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10091,24 +10091,24 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10425,24 +10425,24 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -10759,24 +10759,24 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11093,24 +11093,24 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -11903,18 +11903,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3]
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
%val = load atomic i32, ptr %in syncscope("workgroup-one-as") seq_cst, align 4
@@ -12066,16 +12066,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup-one-as") unordered, align 4
@@ -12226,16 +12226,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup-one-as") monotonic, align 4
@@ -12396,18 +12396,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup-one-as") release, align 4
@@ -12568,18 +12568,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("workgroup-one-as") seq_cst, align 4
@@ -12730,16 +12730,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") monotonic
@@ -12900,17 +12900,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -13071,18 +13071,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") release
@@ -13253,19 +13253,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -13436,19 +13436,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -13651,18 +13651,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -13878,20 +13878,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -14107,20 +14107,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -14360,20 +14360,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14623,21 +14623,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -14887,22 +14887,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15162,23 +15162,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15438,23 +15438,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15704,21 +15704,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -15968,21 +15968,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16242,23 +16242,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16518,23 +16518,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -16794,23 +16794,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17070,23 +17070,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17346,23 +17346,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17622,23 +17622,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -17898,23 +17898,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18174,23 +18174,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18474,22 +18474,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -18783,22 +18783,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19094,24 +19094,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19417,24 +19417,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -19740,24 +19740,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20053,22 +20053,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20362,22 +20362,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -20683,24 +20683,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21006,24 +21006,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21329,24 +21329,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21652,24 +21652,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -21973,24 +21973,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22296,24 +22296,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22619,24 +22619,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
@@ -22942,24 +22942,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
index ad163cefe57d4..5c2d8eb4f5ec0 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
@@ -829,19 +829,19 @@ define amdgpu_kernel void @global_agent_seq_cst_load(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_seq_cst_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent") seq_cst, align 4
@@ -1000,16 +1000,16 @@ define amdgpu_kernel void @global_agent_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") unordered, align 4
@@ -1167,16 +1167,16 @@ define amdgpu_kernel void @global_agent_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") monotonic, align 4
@@ -1359,19 +1359,19 @@ define amdgpu_kernel void @global_agent_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") release, align 4
@@ -1554,19 +1554,19 @@ define amdgpu_kernel void @global_agent_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") seq_cst, align 4
@@ -1722,16 +1722,16 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") monotonic
@@ -1916,18 +1916,18 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acquire
@@ -2108,19 +2108,19 @@ define amdgpu_kernel void @global_agent_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") release
@@ -2330,21 +2330,21 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acq_rel
@@ -2554,21 +2554,21 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
@@ -2768,19 +2768,19 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acquire
@@ -3010,22 +3010,22 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acq_rel
@@ -3255,22 +3255,22 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
@@ -3491,20 +3491,20 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3754,22 +3754,22 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4015,23 +4015,23 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4306,25 +4306,25 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4599,25 +4599,25 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4867,22 +4867,22 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5132,22 +5132,22 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5422,25 +5422,25 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5715,25 +5715,25 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6008,25 +6008,25 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6301,25 +6301,25 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6594,25 +6594,25 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6887,25 +6887,25 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7180,25 +7180,25 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7473,25 +7473,25 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7742,22 +7742,22 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8026,23 +8026,23 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8320,25 +8320,25 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8636,26 +8636,26 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8953,26 +8953,26 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9245,23 +9245,23 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9530,23 +9530,23 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9844,26 +9844,26 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10161,26 +10161,26 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10478,26 +10478,26 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10795,26 +10795,26 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -11108,26 +11108,26 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -11425,26 +11425,26 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -11742,26 +11742,26 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -12059,26 +12059,26 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -12903,24 +12903,24 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
- ptr addrspace(1) %in, ptr addrspace(1) %out) {
-entry:
- %val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") seq_cst, align 4
- store i32 %val, ptr addrspace(1) %out
- ret void
+; GFX1250-LABEL: global_agent_one_as_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
+ ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry:
+ %val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") seq_cst, align 4
+ store i32 %val, ptr addrspace(1) %out
+ ret void
}
define amdgpu_kernel void @global_agent_one_as_unordered_store(
@@ -13074,16 +13074,16 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") unordered, align 4
@@ -13241,16 +13241,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") monotonic, align 4
@@ -13433,19 +13433,19 @@ define amdgpu_kernel void @global_agent_one_as_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") release, align 4
@@ -13628,19 +13628,19 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") seq_cst, align 4
@@ -13796,16 +13796,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") monotonic
@@ -13990,18 +13990,18 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acquire
@@ -14182,19 +14182,19 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") release
@@ -14404,21 +14404,21 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -14628,21 +14628,21 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -14842,19 +14842,19 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acquire
@@ -15084,22 +15084,22 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -15329,22 +15329,22 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -15565,20 +15565,20 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15828,22 +15828,22 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16089,23 +16089,23 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16380,25 +16380,25 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16673,25 +16673,25 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16941,22 +16941,22 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17206,22 +17206,22 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17496,25 +17496,25 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17789,25 +17789,25 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18082,25 +18082,25 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18375,25 +18375,25 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18668,25 +18668,25 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18961,25 +18961,25 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19254,25 +19254,25 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19547,25 +19547,25 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19816,22 +19816,22 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20100,23 +20100,23 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20414,26 +20414,26 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20731,26 +20731,26 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21023,23 +21023,23 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21308,23 +21308,23 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21622,26 +21622,26 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21939,26 +21939,26 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -22256,26 +22256,26 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -22573,26 +22573,26 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -22886,26 +22886,26 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -23203,26 +23203,26 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -23520,26 +23520,26 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -23837,26 +23837,26 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
index bda702156905a..ca7802d295e0b 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
@@ -87,16 +87,16 @@ define amdgpu_kernel void @global_last_use_and_volatile_load(ptr addrspace(1) %i
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_last_use_and_volatile_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_last_use_and_volatile_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
entry:
%val = load volatile i32, ptr addrspace(1) %in, align 4, !amdgpu.last.use !{}
store i32 %val, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
index 4575cbbfd839e..d74c230488ea2 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
@@ -1105,16 +1105,16 @@ define amdgpu_kernel void @global_nontemporal_volatile_load(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_nontemporal_volatile_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_nontemporal_volatile_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load volatile i32, ptr addrspace(1) %in, align 4, !nontemporal !0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
index 4f2ea4493560f..e7f7b1d196be7 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
@@ -945,16 +945,16 @@ define amdgpu_kernel void @global_singlethread_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") unordered, align 4
@@ -1112,16 +1112,16 @@ define amdgpu_kernel void @global_singlethread_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") monotonic, align 4
@@ -1279,16 +1279,16 @@ define amdgpu_kernel void @global_singlethread_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") release, align 4
@@ -1446,16 +1446,16 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") seq_cst, align 4
@@ -1611,16 +1611,16 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") monotonic
@@ -1776,16 +1776,16 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acquire
@@ -1941,16 +1941,16 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") release
@@ -2106,16 +2106,16 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acq_rel
@@ -2271,16 +2271,16 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") seq_cst
@@ -2464,18 +2464,18 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acquire
@@ -2660,18 +2660,18 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acq_rel
@@ -2856,18 +2856,18 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") seq_cst
@@ -3088,20 +3088,20 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3322,20 +3322,20 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3556,20 +3556,20 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3790,20 +3790,20 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4024,20 +4024,20 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4258,20 +4258,20 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4492,20 +4492,20 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4726,20 +4726,20 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4960,20 +4960,20 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5194,20 +5194,20 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5428,20 +5428,20 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5662,20 +5662,20 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5896,20 +5896,20 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6130,20 +6130,20 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6364,20 +6364,20 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6628,22 +6628,22 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6896,22 +6896,22 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7164,22 +7164,22 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7432,22 +7432,22 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7700,22 +7700,22 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7968,22 +7968,22 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8236,22 +8236,22 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8504,22 +8504,22 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8772,22 +8772,22 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9040,22 +9040,22 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9308,22 +9308,22 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9576,22 +9576,22 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9844,22 +9844,22 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10112,22 +10112,22 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10380,22 +10380,22 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -11336,16 +11336,16 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") unordered, align 4
@@ -11503,16 +11503,16 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") monotonic, align 4
@@ -11670,16 +11670,16 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") release, align 4
@@ -11837,16 +11837,16 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") seq_cst, align 4
@@ -12002,16 +12002,16 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") monotonic
@@ -12167,16 +12167,16 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -12332,16 +12332,16 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") release
@@ -12497,16 +12497,16 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -12662,16 +12662,16 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -12855,18 +12855,18 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -13051,18 +13051,18 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -13247,18 +13247,18 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -13479,20 +13479,20 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -13713,20 +13713,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -13947,20 +13947,20 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14181,20 +14181,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14415,20 +14415,20 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14649,20 +14649,20 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14883,20 +14883,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15117,20 +15117,20 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15351,20 +15351,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15585,20 +15585,20 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15819,20 +15819,20 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16053,20 +16053,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16287,20 +16287,20 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16521,20 +16521,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16755,20 +16755,20 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17019,22 +17019,22 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17287,22 +17287,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17555,22 +17555,22 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17823,22 +17823,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18091,22 +18091,22 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18359,22 +18359,22 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18627,22 +18627,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18895,22 +18895,22 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19163,22 +19163,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19431,22 +19431,22 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19699,22 +19699,22 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19967,22 +19967,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20235,22 +20235,22 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20503,22 +20503,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20771,22 +20771,22 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
index c8a45deccb462..e7880a81800fd 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
@@ -833,19 +833,19 @@ define amdgpu_kernel void @global_system_seq_cst_load(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_seq_cst_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in seq_cst, align 4
@@ -1004,16 +1004,16 @@ define amdgpu_kernel void @global_system_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out unordered, align 4
@@ -1171,16 +1171,16 @@ define amdgpu_kernel void @global_system_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out monotonic, align 4
@@ -1367,19 +1367,19 @@ define amdgpu_kernel void @global_system_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out release, align 4
@@ -1566,19 +1566,19 @@ define amdgpu_kernel void @global_system_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out seq_cst, align 4
@@ -1734,16 +1734,16 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in monotonic
@@ -1930,18 +1930,18 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acquire
@@ -2126,19 +2126,19 @@ define amdgpu_kernel void @global_system_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in release
@@ -2354,21 +2354,21 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acq_rel
@@ -2584,21 +2584,21 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in seq_cst
@@ -2800,19 +2800,19 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acquire
@@ -3048,22 +3048,22 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acq_rel
@@ -3299,22 +3299,22 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in seq_cst
@@ -3535,20 +3535,20 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3800,22 +3800,22 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4065,23 +4065,23 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4362,25 +4362,25 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4661,25 +4661,25 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4931,22 +4931,22 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5198,22 +5198,22 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5494,25 +5494,25 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5793,25 +5793,25 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6092,25 +6092,25 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6391,25 +6391,25 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6660,22 +6660,22 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6946,23 +6946,23 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7266,26 +7266,26 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7589,26 +7589,26 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7883,23 +7883,23 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8170,23 +8170,23 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8490,26 +8490,26 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8813,26 +8813,26 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9136,26 +9136,26 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9459,26 +9459,26 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9778,26 +9778,26 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10101,26 +10101,26 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10424,26 +10424,26 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10747,26 +10747,26 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -11595,19 +11595,19 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_seq_cst_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") seq_cst, align 4
@@ -11766,16 +11766,16 @@ define amdgpu_kernel void @global_system_one_as_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") unordered, align 4
@@ -11933,16 +11933,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") monotonic, align 4
@@ -12129,19 +12129,19 @@ define amdgpu_kernel void @global_system_one_as_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") release, align 4
@@ -12328,19 +12328,19 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") seq_cst, align 4
@@ -12496,16 +12496,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") monotonic
@@ -12692,18 +12692,18 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acquire
@@ -12888,19 +12888,19 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") release
@@ -13116,21 +13116,21 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acq_rel
@@ -13346,21 +13346,21 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") seq_cst
@@ -13562,19 +13562,19 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acquire
@@ -13810,22 +13810,22 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acq_rel
@@ -14061,22 +14061,22 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") seq_cst
@@ -14297,20 +14297,20 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14562,22 +14562,22 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14827,23 +14827,23 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15124,25 +15124,25 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15423,25 +15423,25 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15693,22 +15693,22 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15960,22 +15960,22 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16256,25 +16256,25 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16555,25 +16555,25 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16854,25 +16854,25 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17153,25 +17153,25 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17452,25 +17452,25 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17751,25 +17751,25 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18050,25 +18050,25 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18349,25 +18349,25 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18618,22 +18618,22 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18904,23 +18904,23 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19202,25 +19202,25 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19524,26 +19524,26 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19847,26 +19847,26 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20141,23 +20141,23 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20428,23 +20428,23 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20748,26 +20748,26 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21071,26 +21071,26 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21394,26 +21394,26 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21717,26 +21717,26 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -22036,26 +22036,26 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -22359,26 +22359,26 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -22682,26 +22682,26 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -23005,26 +23005,26 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
index f4fdec7490117..3bf5ed8b2397f 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
@@ -148,16 +148,16 @@ define amdgpu_kernel void @global_volatile_load_0(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_volatile_load_0:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_volatile_load_0:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load volatile i32, ptr addrspace(1) %in, align 4
@@ -355,20 +355,20 @@ define amdgpu_kernel void @global_volatile_load_1(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_volatile_load_1:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff
-; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_volatile_load_1:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v1, v0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_mov_b32 s4, 0x3ff
+; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1030,18 +1030,18 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_volatile_workgroup_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_volatile_workgroup_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic volatile i32 %in, ptr addrspace(1) %out syncscope("workgroup") release, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
index f66e6d00e6eab..09eb062d876f6 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
@@ -945,16 +945,16 @@ define amdgpu_kernel void @global_wavefront_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") unordered, align 4
@@ -1112,16 +1112,16 @@ define amdgpu_kernel void @global_wavefront_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") monotonic, align 4
@@ -1279,16 +1279,16 @@ define amdgpu_kernel void @global_wavefront_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") release, align 4
@@ -1446,16 +1446,16 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") seq_cst, align 4
@@ -1611,16 +1611,16 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") monotonic
@@ -1776,16 +1776,16 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acquire
@@ -1941,16 +1941,16 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") release
@@ -2106,16 +2106,16 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acq_rel
@@ -2271,16 +2271,16 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") seq_cst
@@ -2464,18 +2464,18 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acquire
@@ -2660,18 +2660,18 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acq_rel
@@ -2856,18 +2856,18 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") seq_cst
@@ -3088,20 +3088,20 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3322,20 +3322,20 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3556,20 +3556,20 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3790,20 +3790,20 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4024,20 +4024,20 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4258,20 +4258,20 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4492,20 +4492,20 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4726,20 +4726,20 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4960,20 +4960,20 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5194,20 +5194,20 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5428,20 +5428,20 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5662,20 +5662,20 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5896,20 +5896,20 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6130,20 +6130,20 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6364,20 +6364,20 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6628,22 +6628,22 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6896,22 +6896,22 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7164,22 +7164,22 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7432,22 +7432,22 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7700,22 +7700,22 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7968,22 +7968,22 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8236,22 +8236,22 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8504,22 +8504,22 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8772,22 +8772,22 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9040,22 +9040,22 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9308,22 +9308,22 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9576,22 +9576,22 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9844,22 +9844,22 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10112,22 +10112,22 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10380,22 +10380,22 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -11336,16 +11336,16 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") unordered, align 4
@@ -11503,16 +11503,16 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") monotonic, align 4
@@ -11670,16 +11670,16 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") release, align 4
@@ -11837,16 +11837,16 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") seq_cst, align 4
@@ -12002,16 +12002,16 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") monotonic
@@ -12167,16 +12167,16 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -12332,16 +12332,16 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") release
@@ -12497,16 +12497,16 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -12662,16 +12662,16 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -12855,18 +12855,18 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -13051,18 +13051,18 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -13247,18 +13247,18 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -13479,20 +13479,20 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -13713,20 +13713,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -13947,20 +13947,20 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14181,20 +14181,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14415,20 +14415,20 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14649,20 +14649,20 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14883,20 +14883,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15117,20 +15117,20 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15351,20 +15351,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15585,20 +15585,20 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15819,20 +15819,20 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16053,20 +16053,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16287,20 +16287,20 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16521,20 +16521,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16755,20 +16755,20 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17019,22 +17019,22 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17287,22 +17287,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17555,22 +17555,22 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17823,22 +17823,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18091,22 +18091,22 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18359,22 +18359,22 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18627,22 +18627,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18895,22 +18895,22 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19163,22 +19163,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19431,22 +19431,22 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19699,22 +19699,22 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19967,22 +19967,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20235,22 +20235,22 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20503,22 +20503,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20771,22 +20771,22 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
index bbbf8cf7f5cb1..885edec03c2b6 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
@@ -799,18 +799,18 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_seq_cst_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") seq_cst, align 4
@@ -969,16 +969,16 @@ define amdgpu_kernel void @global_workgroup_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") unordered, align 4
@@ -1136,16 +1136,16 @@ define amdgpu_kernel void @global_workgroup_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") monotonic, align 4
@@ -1321,18 +1321,18 @@ define amdgpu_kernel void @global_workgroup_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") release, align 4
@@ -1508,18 +1508,18 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") seq_cst, align 4
@@ -1675,16 +1675,16 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") monotonic
@@ -1850,17 +1850,17 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acquire
@@ -2034,18 +2034,18 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") release
@@ -2229,19 +2229,19 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acq_rel
@@ -2425,19 +2425,19 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
@@ -2626,18 +2626,18 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acquire
@@ -2847,20 +2847,20 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acq_rel
@@ -3070,20 +3070,20 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
@@ -3304,20 +3304,20 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3548,21 +3548,21 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -3801,22 +3801,22 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4065,23 +4065,23 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4330,23 +4330,23 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4577,21 +4577,21 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -4822,21 +4822,21 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5085,23 +5085,23 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5350,23 +5350,23 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5615,23 +5615,23 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -5880,23 +5880,23 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6145,23 +6145,23 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6410,23 +6410,23 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6675,23 +6675,23 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -6940,23 +6940,23 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7207,22 +7207,22 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7480,22 +7480,22 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -7766,24 +7766,24 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8061,24 +8061,24 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8356,24 +8356,24 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8633,22 +8633,22 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -8906,22 +8906,22 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9199,24 +9199,24 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9494,24 +9494,24 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -9789,24 +9789,24 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10084,24 +10084,24 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10377,24 +10377,24 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10672,24 +10672,24 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -10967,24 +10967,24 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -11262,24 +11262,24 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -12070,18 +12070,18 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") seq_cst, align 4
@@ -12240,16 +12240,16 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_unordered_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_unordered_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") unordered, align 4
@@ -12407,16 +12407,16 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_monotonic_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") monotonic, align 4
@@ -12584,18 +12584,18 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") release, align 4
@@ -12763,18 +12763,18 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") seq_cst, align 4
@@ -12930,16 +12930,16 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") monotonic
@@ -13105,17 +13105,17 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_acquire_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -13281,18 +13281,18 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") release
@@ -13468,19 +13468,19 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -13656,19 +13656,19 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -13857,18 +13857,18 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -14070,20 +14070,20 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -14285,20 +14285,20 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -14519,20 +14519,20 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -14763,21 +14763,21 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15008,22 +15008,22 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15264,23 +15264,23 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15521,23 +15521,23 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -15768,21 +15768,21 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16013,21 +16013,21 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16268,23 +16268,23 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16525,23 +16525,23 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -16782,23 +16782,23 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17039,23 +17039,23 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17296,23 +17296,23 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17553,23 +17553,23 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -17810,23 +17810,23 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18067,23 +18067,23 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18334,22 +18334,22 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18607,22 +18607,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -18885,24 +18885,24 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19172,24 +19172,24 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19459,24 +19459,24 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -19736,22 +19736,22 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20009,22 +20009,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20294,24 +20294,24 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20581,24 +20581,24 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -20868,24 +20868,24 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21155,24 +21155,24 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21440,24 +21440,24 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -21727,24 +21727,24 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -22014,24 +22014,24 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
@@ -22301,24 +22301,24 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
-; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_xcnt 0x0
-; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
-; GFX1250-CU-NEXT: s_wait_loadcnt 0x0
-; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX1250-NEXT: v_mov_b32_e32 v3, v1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
index 7428ddc780675..986b48b60a443 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
@@ -756,19 +756,19 @@ define amdgpu_kernel void @local_agent_seq_cst_load(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_seq_cst_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_load_b32 v1, v0
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("agent") seq_cst, align 4
@@ -1229,17 +1229,17 @@ define amdgpu_kernel void @local_agent_release_store(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") release, align 4
@@ -1399,17 +1399,17 @@ define amdgpu_kernel void @local_agent_seq_cst_store(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") seq_cst, align 4
@@ -1886,17 +1886,17 @@ define amdgpu_kernel void @local_agent_release_atomicrmw(
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") release
@@ -2072,18 +2072,18 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") acq_rel
@@ -2259,18 +2259,18 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") seq_cst
@@ -2678,20 +2678,20 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") acq_rel
@@ -2899,20 +2899,20 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") seq_cst
@@ -3474,19 +3474,19 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3689,20 +3689,20 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3905,20 +3905,20 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4513,20 +4513,20 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4729,20 +4729,20 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4945,20 +4945,20 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5161,20 +5161,20 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5377,20 +5377,20 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5593,20 +5593,20 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5809,20 +5809,20 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6025,20 +6025,20 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6724,22 +6724,22 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6975,22 +6975,22 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7226,22 +7226,22 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7939,22 +7939,22 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8190,22 +8190,22 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8441,22 +8441,22 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8692,22 +8692,22 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8943,22 +8943,22 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9194,22 +9194,22 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9445,22 +9445,22 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9696,22 +9696,22 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
index d57736ba0230c..81bbe0a78203e 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
@@ -756,19 +756,19 @@ define amdgpu_kernel void @local_system_seq_cst_load(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_seq_cst_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_load_b32 v1, v0
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in seq_cst, align 4
@@ -1229,17 +1229,17 @@ define amdgpu_kernel void @local_system_release_store(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out release, align 4
@@ -1399,17 +1399,17 @@ define amdgpu_kernel void @local_system_seq_cst_store(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out seq_cst, align 4
@@ -1886,17 +1886,17 @@ define amdgpu_kernel void @local_system_release_atomicrmw(
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in release
@@ -2072,18 +2072,18 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in acq_rel
@@ -2259,18 +2259,18 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in seq_cst
@@ -2678,20 +2678,20 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in acq_rel
@@ -2899,20 +2899,20 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in seq_cst
@@ -3474,19 +3474,19 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3689,20 +3689,20 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3905,20 +3905,20 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4513,20 +4513,20 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4729,20 +4729,20 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4945,20 +4945,20 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5161,20 +5161,20 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5377,20 +5377,20 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5593,20 +5593,20 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5809,20 +5809,20 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6025,20 +6025,20 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6724,22 +6724,22 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6975,22 +6975,22 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7226,22 +7226,22 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7939,22 +7939,22 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8190,22 +8190,22 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8441,22 +8441,22 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8692,22 +8692,22 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8943,22 +8943,22 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9194,22 +9194,22 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9445,22 +9445,22 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9696,22 +9696,22 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
index d8ba02adf4b35..980141a87ecf3 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
@@ -883,17 +883,17 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_volatile_workgroup_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_volatile_workgroup_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic volatile i32 %in, ptr addrspace(3) %out syncscope("workgroup") release, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
index 7220c071bf657..6a233a2c9013b 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
@@ -756,19 +756,19 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_seq_cst_load:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_load_b32 v1, v0
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_seq_cst_load:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %in, ptr addrspace(3) %out) {
entry:
%val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup") seq_cst, align 4
@@ -1229,17 +1229,17 @@ define amdgpu_kernel void @local_workgroup_release_store(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_release_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_release_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") release, align 4
@@ -1399,17 +1399,17 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_seq_cst_store:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_seq_cst_store:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(3) %out) {
entry:
store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") seq_cst, align 4
@@ -1886,17 +1886,17 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw(
; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_release_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_release_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") release
@@ -2072,18 +2072,18 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_acq_rel_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_acq_rel_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acq_rel
@@ -2259,18 +2259,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_seq_cst_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_seq_cst_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s1
+; GFX1250-NEXT: v_mov_b32_e32 v1, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") seq_cst
@@ -2678,20 +2678,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acq_rel
@@ -2899,20 +2899,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") seq_cst
@@ -3474,19 +3474,19 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg(
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_release_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_release_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3689,20 +3689,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -3905,20 +3905,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4513,20 +4513,20 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_release_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_release_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4729,20 +4729,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -4945,20 +4945,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5161,20 +5161,20 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5377,20 +5377,20 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5593,20 +5593,20 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_release_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_release_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -5809,20 +5809,20 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6025,20 +6025,20 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s2
+; GFX1250-NEXT: v_mov_b32_e32 v1, s1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6724,22 +6724,22 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -6975,22 +6975,22 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7226,22 +7226,22 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -7939,22 +7939,22 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8190,22 +8190,22 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8441,22 +8441,22 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8692,22 +8692,22 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -8943,22 +8943,22 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9194,22 +9194,22 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9445,22 +9445,22 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
@@ -9696,22 +9696,22 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
-; GFX1250-CU-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
-; GFX1250-CU: ; %bb.0: ; %entry
-; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
-; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4
-; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
-; GFX1250-CU-NEXT: s_wait_kmcnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2
-; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-CU-NEXT: s_wait_storecnt 0x0
-; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
-; GFX1250-CU-NEXT: s_wait_dscnt 0x0
-; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0
-; GFX1250-CU-NEXT: ds_store_b32 v0, v1
-; GFX1250-CU-NEXT: s_endpgm
+; GFX1250-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4
+; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
ptr addrspace(3) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(3) %out, i32 4
>From c6180d95965e41a440b7e6b92e0e8890000091b8 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Fri, 22 Aug 2025 10:12:03 +0200
Subject: [PATCH 4/5] Comments
---
llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 7 ++++---
llvm/lib/Target/AMDGPU/SOPInstructions.td | 2 +-
2 files changed, 5 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 0451b27bc81c5..ac6ad085f014e 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -609,7 +609,7 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {
// GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases
// the behavior is the same if assuming GFX12.0 in CU mode.
- assert(ST.hasGFX1250Insts() ? ST.isCuModeEnabled() : true);
+ assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled());
}
bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
@@ -2630,14 +2630,15 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
const bool IsRMW = (MI.mayLoad() && MI.mayStore());
bool Changed = false;
- // GFX12.5 only: xcnt wait is needed before flat and global atomics stores/rmw
+ // GFX12.5 only: an xcnt wait is needed before flat and global atomic
+ // stores/RMWs.
if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) {
MachineBasicBlock &MBB = *MI.getParent();
BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
Changed = true;
}
- // Remaining fixes do not apply to RMWs
+ // Remaining fixes do not apply to RMWs.
if (IsRMW)
return Changed;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index e170268b47c44..12a27db241c4e 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1655,7 +1655,7 @@ let OtherPredicates = [HasImageInsts] in {
let SubtargetPredicate = HasWaitXcnt in {
- def S_WAIT_XCNT_soft : SOPP_Pseudo<"s_soft_wait_xcnt", (ins s16imm:$simm16), "$simm16">;
+ def S_WAIT_XCNT_soft : SOPP_Pseudo<"", (ins s16imm:$simm16), "$simm16">;
}
// Represents the point at which a wave must wait for all outstanding direct loads to LDS.
>From 9567d51d848b886011d1ef5557303988583ceb0a Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 9 Sep 2025 10:29:04 +0200
Subject: [PATCH 5/5] Rebase, handle barrier patch
---
llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 3 +-
.../test/CodeGen/AMDGPU/flat-saddr-atomics.ll | 108 +++---
llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll | 4 +
.../llvm.amdgcn.cooperative.atomic-agent.ll | 12 +
...lvm.amdgcn.cooperative.atomic-workgroup.ll | 18 +
.../AMDGPU/memory-legalizer-barriers.ll | 9 +-
.../AMDGPU/memory-legalizer-private-agent.ll | 310 ++++++++----------
.../memory-legalizer-private-singlethread.ll | 84 +++++
.../AMDGPU/memory-legalizer-private-system.ll | 241 +++++---------
.../memory-legalizer-private-wavefront.ll | 84 +++++
.../memory-legalizer-private-workgroup.ll | 282 +++++++++++++---
11 files changed, 731 insertions(+), 424 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index ac6ad085f014e..c20fcacb8fb26 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -2202,7 +2202,8 @@ bool SIGfx10CacheControl::insertBarrierStart(
// mode. This is because a CU mode release fence does not emit any wait, which
// is fine when only dealing with vmem, but isn't sufficient in the presence
// of barriers which do not go through vmem.
- if (!ST.isCuModeEnabled())
+ // GFX12.5 does not require this additional wait.
+ if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts())
return false;
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
index 62129ebe40358..1bf37d512f845 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
@@ -11859,7 +11859,9 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) {
; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB110_4
; GFX1250-SDAG-NEXT: ; %bb.2: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB110_5
; GFX1250-SDAG-NEXT: s_branch .LBB110_6
; GFX1250-SDAG-NEXT: .LBB110_3:
@@ -11873,7 +11875,6 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) {
; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2
; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1250-SDAG-NEXT: s_cselect_b32 s2, s2, -1
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s2
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_add_f64_e32 v[4:5], v[2:3], v[0:1]
@@ -11884,12 +11885,12 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) {
; GFX1250-SDAG-NEXT: .LBB110_7: ; %atomicrmw.shared
; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s0, -1
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1]
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: .LBB110_8: ; %atomicrmw.end
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31]
;
@@ -11915,7 +11916,9 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) {
; GFX1250-GISEL-NEXT: ; %bb.2: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: .LBB110_3: ; %Flow
; GFX1250-GISEL-NEXT: s_xor_b32 s2, s2, 1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -11927,7 +11930,6 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) {
; GFX1250-GISEL-NEXT: s_sub_co_i32 s2, s0, s2
; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1250-GISEL-NEXT: s_cselect_b32 s2, s2, -1
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s2
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_add_f64_e32 v[4:5], v[2:3], v[0:1]
@@ -11943,12 +11945,12 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) {
; GFX1250-GISEL-NEXT: ; %bb.7: ; %atomicrmw.shared
; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1]
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: .LBB110_8: ; %atomicrmw.end
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31]
;
@@ -12085,7 +12087,9 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) {
; GFX1250-SDAG-NEXT: ; %bb.4: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-SDAG-NEXT: s_mov_b32 s2, 0
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: .LBB111_5: ; %Flow
; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
; GFX1250-SDAG-NEXT: s_cbranch_vccnz .LBB111_7
@@ -12107,6 +12111,7 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) {
; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s0, -1
; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: ds_add_f64 v2, v[0:1]
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31]
@@ -12131,7 +12136,9 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) {
; GFX1250-GISEL-NEXT: ; %bb.2: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: .LBB111_3: ; %Flow
; GFX1250-GISEL-NEXT: s_xor_b32 s2, s2, 1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -12160,6 +12167,7 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) {
; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: ds_add_f64 v2, v[0:1]
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: .LBB111_8: ; %atomicrmw.phi
@@ -12278,20 +12286,19 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) {
; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB112_2
; GFX1250-SDAG-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_max_num_f64 v[2:3], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB112_3
; GFX1250-SDAG-NEXT: s_branch .LBB112_4
; GFX1250-SDAG-NEXT: .LBB112_2:
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: .LBB112_3: ; %atomicrmw.private
; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2
; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
@@ -12299,7 +12306,7 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) {
; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[0:1]
; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE
; GFX1250-SDAG-NEXT: .LBB112_4: ; %atomicrmw.end
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31]
;
@@ -12319,10 +12326,10 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) {
; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_max_num_f64 v[2:3], v2, v[0:1], s[0:1] offset:80 th:TH_ATOMIC_RETURN
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: .LBB112_2: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_xor_b32 s0, s4, 1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0
@@ -12333,7 +12340,6 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) {
; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0
; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
@@ -12341,7 +12347,7 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) {
; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[0:1]
; GFX1250-GISEL-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE
; GFX1250-GISEL-NEXT: .LBB112_4: ; %atomicrmw.end
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31]
;
@@ -12438,8 +12444,9 @@ define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) {
; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31]
; GFX1250-SDAG-NEXT: .LBB113_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_max_num_f64 v2, v[0:1], s[0:1]
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB113_2
; GFX1250-SDAG-NEXT: .LBB113_4: ; %atomicrmw.private
; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo
@@ -12471,8 +12478,9 @@ define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) {
; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_max_num_f64 v2, v[0:1], s[0:1] offset:80
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: .LBB113_2: ; %Flow
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_xor_b32 s0, s4, 1
@@ -12579,20 +12587,19 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) {
; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB114_2
; GFX1250-SDAG-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_min_num_f64 v[2:3], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB114_3
; GFX1250-SDAG-NEXT: s_branch .LBB114_4
; GFX1250-SDAG-NEXT: .LBB114_2:
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: .LBB114_3: ; %atomicrmw.private
; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2
; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
@@ -12600,7 +12607,7 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) {
; GFX1250-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[4:5], v[0:1]
; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE
; GFX1250-SDAG-NEXT: .LBB114_4: ; %atomicrmw.end
-; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31]
;
@@ -12620,10 +12627,10 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) {
; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_min_num_f64 v[2:3], v2, v[0:1], s[0:1] offset:80 th:TH_ATOMIC_RETURN
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: .LBB114_2: ; %Flow
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_xor_b32 s0, s4, 1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0
@@ -12634,7 +12641,6 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) {
; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0
; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
@@ -12642,7 +12648,7 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) {
; GFX1250-GISEL-NEXT: v_min_num_f64_e32 v[0:1], v[4:5], v[0:1]
; GFX1250-GISEL-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE
; GFX1250-GISEL-NEXT: .LBB114_4: ; %atomicrmw.end
-; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31]
;
@@ -12739,8 +12745,9 @@ define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) {
; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31]
; GFX1250-SDAG-NEXT: .LBB115_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_min_num_f64 v2, v[0:1], s[0:1]
-; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB115_2
; GFX1250-SDAG-NEXT: .LBB115_4: ; %atomicrmw.private
; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo
@@ -12772,8 +12779,9 @@ define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) {
; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0
; GFX1250-GISEL-NEXT: flat_atomic_min_num_f64 v2, v[0:1], s[0:1] offset:80
-; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: .LBB115_2: ; %Flow
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: s_xor_b32 s0, s4, 1
@@ -12870,6 +12878,7 @@ define float @flat_atomic_fadd_f32_saddr_rtn(ptr inreg %ptr, float %data) {
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_add_f32 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -12901,8 +12910,9 @@ define void @flat_atomic_fadd_f32_saddr_nortn(ptr inreg %ptr, float %data) {
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_add_f32 v1, v0, s[0:1] offset:40
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
;
; GFX950-SDAG-LABEL: flat_atomic_fadd_f32_saddr_nortn:
@@ -12932,6 +12942,7 @@ define float @flat_atomic_fmax_f32_saddr_rtn(ptr inreg %ptr, float %data) {
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_max_num_f32 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -12994,8 +13005,9 @@ define void @flat_atomic_fmax_f32_saddr_nortn(ptr inreg %ptr, float %data) {
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_max_num_f32 v1, v0, s[0:1] offset:40
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
;
; GFX950-SDAG-LABEL: flat_atomic_fmax_f32_saddr_nortn:
@@ -13055,6 +13067,7 @@ define float @flat_atomic_fmin_f32_saddr_rtn(ptr inreg %ptr, float %data) {
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_min_num_f32 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -13117,8 +13130,9 @@ define void @flat_atomic_fmin_f32_saddr_nortn(ptr inreg %ptr, float %data) {
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_min_num_f32 v1, v0, s[0:1] offset:40
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
;
; GFX950-SDAG-LABEL: flat_atomic_fmin_f32_saddr_nortn:
@@ -13178,6 +13192,7 @@ define <2 x half> @flat_atomic_fadd_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> %
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_pk_add_f16 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -13209,8 +13224,9 @@ define void @flat_atomic_fadd_v2f16_saddr_nortn(ptr inreg %ptr, <2 x half> %data
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_pk_add_f16 v1, v0, s[0:1] offset:40
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
;
; GFX950-SDAG-LABEL: flat_atomic_fadd_v2f16_saddr_nortn:
@@ -13251,11 +13267,13 @@ define <2 x half> @flat_atomic_fmax_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> %
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_pk_max_num_f16 v0, v5, v5
; GFX1250-NEXT: v_pk_max_num_f16 v4, v0, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
; GFX1250-NEXT: s_cbranch_execnz .LBB124_1
; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13331,10 +13349,11 @@ define void @flat_atomic_fmax_v2f16_saddr_nortn(ptr inreg %ptr, <2 x half> %data
; GFX1250-NEXT: v_pk_max_num_f16 v0, v1, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_pk_max_num_f16 v0, v0, v3
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:40 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, v0
; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -13414,11 +13433,13 @@ define <2 x half> @flat_atomic_fmin_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> %
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_pk_max_num_f16 v0, v5, v5
; GFX1250-NEXT: v_pk_min_num_f16 v4, v0, v1
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
; GFX1250-NEXT: s_cbranch_execnz .LBB126_1
; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13494,10 +13515,11 @@ define void @flat_atomic_fmin_v2f16_saddr_nortn(ptr inreg %ptr, <2 x half> %data
; GFX1250-NEXT: v_pk_max_num_f16 v0, v1, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_pk_min_num_f16 v0, v0, v3
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:40 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, v0
; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -13566,6 +13588,7 @@ define <2 x bfloat> @flat_atomic_fadd_v2bf16_saddr_rtn(ptr inreg %ptr, <2 x bflo
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_pk_add_bf16 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -13589,8 +13612,9 @@ define void @flat_atomic_fadd_v2bf16_saddr_nortn(ptr inreg %ptr, <2 x bfloat> %d
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_pk_add_bf16 v1, v0, s[0:1] offset:40
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
;
; GFX950-LABEL: flat_atomic_fadd_v2bf16_saddr_nortn:
@@ -13620,11 +13644,13 @@ define <2 x bfloat> @flat_atomic_fmax_v2bf16_saddr_rtn(ptr inreg %ptr, <2 x bflo
; GFX1250-NEXT: v_mov_b32_e32 v5, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_pk_max_num_bf16 v4, v5, v0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
; GFX1250-NEXT: s_cbranch_execnz .LBB130_1
; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13676,10 +13702,11 @@ define void @flat_atomic_fmax_v2bf16_saddr_nortn(ptr inreg %ptr, <2 x bfloat> %d
; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: v_pk_max_num_bf16 v2, v3, v0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v2, v1, v[2:3], s[0:1] offset:40 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v3, v2
; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -13734,11 +13761,13 @@ define <2 x bfloat> @flat_atomic_fmin_v2bf16_saddr_rtn(ptr inreg %ptr, <2 x bflo
; GFX1250-NEXT: v_mov_b32_e32 v5, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_pk_min_num_bf16 v4, v5, v0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
; GFX1250-NEXT: s_cbranch_execnz .LBB132_1
; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -13790,10 +13819,11 @@ define void @flat_atomic_fmin_v2bf16_saddr_nortn(ptr inreg %ptr, <2 x bfloat> %d
; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: v_pk_min_num_bf16 v2, v3, v0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v2, v1, v[2:3], s[0:1] offset:40 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v3, v2
; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll
index 1e293c28ce397..ba761bedb905c 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll
@@ -38,6 +38,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -79,6 +80,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_atomic_add_f32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -189,6 +191,7 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half>
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -255,6 +258,7 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16>
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-agent.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-agent.ll
index e3ec4d1f0f67a..614a221d43d53 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-agent.ll
@@ -130,6 +130,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_release(ptr noundef
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -144,6 +145,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_release(ptr noundef
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -158,6 +160,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_release(ptr noundef
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -220,6 +223,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst(ptr noundef
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -234,6 +238,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst(ptr noundef
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -248,6 +253,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst(ptr noundef
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -385,6 +391,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_release(ptr
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -399,6 +406,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_release(ptr
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -413,6 +421,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_release(ptr
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -475,6 +484,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst(ptr
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -489,6 +499,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst(ptr
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -503,6 +514,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst(ptr
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-workgroup.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-workgroup.ll
index e86f0e0083805..2b04ab5ab8a00 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cooperative.atomic-workgroup.ll
@@ -124,6 +124,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_release(ptr noundef
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -137,6 +138,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_release(ptr noundef
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -150,6 +152,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_release(ptr noundef
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -163,6 +166,7 @@ define i32 @test_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst(ptr noundef r
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -176,6 +180,7 @@ define <2 x i32> @test_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst(ptr nou
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -189,6 +194,7 @@ define <4 x i32> @test_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst(ptr nou
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -202,6 +208,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst(ptr noundef
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -215,6 +222,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst(ptr noundef
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -228,6 +236,7 @@ define void @test_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst(ptr noundef
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -358,6 +367,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_release(ptr
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -371,6 +381,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_release(ptr
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -384,6 +395,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_release(ptr
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -397,6 +409,7 @@ define i32 @test_one_as_flat_amdgcn_cooperative_atomic_load_32x4B_seq_cst(ptr no
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -410,6 +423,7 @@ define <2 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_16x8B_seq_cst(
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -423,6 +437,7 @@ define <4 x i32> @test_one_as_flat_amdgcn_cooperative_atomic_load_8x16B_seq_cst(
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_load_b128 v[0:3], v[0:1] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -436,6 +451,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_32x4B_seq_cst(ptr
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -449,6 +465,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_16x8B_seq_cst(ptr
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -462,6 +479,7 @@ define void @test_one_as_flat_amdgcn_cooperative_atomic_store_8x16B_seq_cst(ptr
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_store_b128 v[0:1], v[2:5] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll
index e921f581c00a7..516c3946f63dc 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll
@@ -45,7 +45,6 @@ define amdgpu_kernel void @test_s_barrier() {
;
; GFX1250-LABEL: test_s_barrier:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_alu 0xffe3
; GFX1250-NEXT: s_barrier_signal -1
; GFX1250-NEXT: s_barrier_wait -1
; GFX1250-NEXT: s_endpgm
@@ -103,8 +102,8 @@ define amdgpu_kernel void @test_s_barrier_workgroup_fence() {
;
; GFX1250-LABEL: test_s_barrier_workgroup_fence:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_wait_alu 0xffe3
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_barrier_signal -1
; GFX1250-NEXT: s_barrier_wait -1
; GFX1250-NEXT: s_endpgm
@@ -168,11 +167,9 @@ define amdgpu_kernel void @test_s_barrier_agent_fence() {
;
; GFX1250-LABEL: test_s_barrier_agent_fence:
; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: s_wait_alu 0xffe3
; GFX1250-NEXT: s_barrier_signal -1
; GFX1250-NEXT: s_barrier_wait -1
; GFX1250-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
index 4ca0cc92e09be..8ac3414da7354 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
@@ -804,13 +804,9 @@ define amdgpu_kernel void @private_agent_seq_cst_load(
; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -992,6 +988,7 @@ define amdgpu_kernel void @private_agent_unordered_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -1170,6 +1167,7 @@ define amdgpu_kernel void @private_agent_monotonic_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -1348,9 +1346,9 @@ define amdgpu_kernel void @private_agent_release_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
@@ -1530,9 +1528,9 @@ define amdgpu_kernel void @private_agent_seq_cst_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
@@ -1712,6 +1710,7 @@ define amdgpu_kernel void @private_agent_monotonic_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -1890,6 +1889,7 @@ define amdgpu_kernel void @private_agent_acquire_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -2070,9 +2070,9 @@ define amdgpu_kernel void @private_agent_release_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
@@ -2252,9 +2252,9 @@ define amdgpu_kernel void @private_agent_acq_rel_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2436,9 +2436,9 @@ define amdgpu_kernel void @private_agent_seq_cst_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2674,6 +2674,7 @@ define amdgpu_kernel void @private_agent_acquire_ret_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -2910,13 +2911,11 @@ define amdgpu_kernel void @private_agent_acq_rel_ret_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -3152,13 +3151,11 @@ define amdgpu_kernel void @private_agent_seq_cst_ret_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -3430,6 +3427,7 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -3699,6 +3697,7 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -3970,9 +3969,9 @@ define amdgpu_kernel void @private_agent_release_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
@@ -4243,9 +4242,9 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -4518,9 +4517,9 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -4793,6 +4792,7 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -5064,6 +5064,7 @@ define amdgpu_kernel void @private_agent_acquire_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -5335,9 +5336,9 @@ define amdgpu_kernel void @private_agent_release_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -5610,9 +5611,9 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -5885,9 +5886,9 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -6160,9 +6161,9 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -6435,9 +6436,9 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -6710,9 +6711,9 @@ define amdgpu_kernel void @private_agent_release_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -6985,9 +6986,9 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -7260,9 +7261,9 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -7563,6 +7564,7 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -7864,6 +7866,7 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -8166,9 +8169,9 @@ define amdgpu_kernel void @private_agent_release_monotonic_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -8471,13 +8474,11 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -8779,13 +8780,11 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -9087,9 +9086,8 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -9391,6 +9389,7 @@ define amdgpu_kernel void @private_agent_acquire_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -9693,13 +9692,11 @@ define amdgpu_kernel void @private_agent_release_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -10001,13 +9998,11 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -10309,13 +10304,11 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -10617,13 +10610,11 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -10925,9 +10916,9 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -11231,13 +11222,11 @@ define amdgpu_kernel void @private_agent_release_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -11539,13 +11528,11 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -11847,13 +11834,11 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -12658,13 +12643,9 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_load(
; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -12847,6 +12828,7 @@ define amdgpu_kernel void @private_agent_one_as_unordered_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -13025,6 +13007,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -13203,10 +13186,10 @@ define amdgpu_kernel void @private_agent_one_as_release_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -13385,10 +13368,10 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -13567,6 +13550,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -13745,6 +13729,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -13925,10 +13910,10 @@ define amdgpu_kernel void @private_agent_one_as_release_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -14107,10 +14092,10 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -14291,10 +14276,10 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -14529,6 +14514,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_ret_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -14766,13 +14752,11 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_ret_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -15009,13 +14993,11 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_ret_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -15288,6 +15270,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -15557,6 +15540,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -15828,10 +15812,10 @@ define amdgpu_kernel void @private_agent_one_as_release_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -16101,10 +16085,10 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -16376,10 +16360,10 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -16651,6 +16635,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -16922,6 +16907,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -17193,10 +17179,10 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -17468,10 +17454,10 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -17743,10 +17729,10 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -18018,10 +18004,10 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -18293,10 +18279,10 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -18568,10 +18554,10 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -18843,10 +18829,10 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -19118,10 +19104,10 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -19421,6 +19407,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -19722,6 +19709,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -20025,13 +20013,11 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -20334,13 +20320,11 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -20643,9 +20627,8 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -20948,6 +20931,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -21251,13 +21235,11 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -21560,13 +21542,11 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -21869,13 +21849,11 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -22178,13 +22156,11 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -22487,10 +22463,10 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -22794,13 +22770,11 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -23103,13 +23077,11 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -23412,13 +23384,11 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_dscnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll
index e9ee6b4925a13..f5ba70e454823 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll
@@ -984,6 +984,7 @@ define amdgpu_kernel void @private_singlethread_unordered_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -1162,6 +1163,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -1340,6 +1342,7 @@ define amdgpu_kernel void @private_singlethread_release_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -1518,6 +1521,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -1696,6 +1700,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -1874,6 +1879,7 @@ define amdgpu_kernel void @private_singlethread_acquire_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -2052,6 +2058,7 @@ define amdgpu_kernel void @private_singlethread_release_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -2230,6 +2237,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -2408,6 +2416,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -2640,6 +2649,7 @@ define amdgpu_kernel void @private_singlethread_acquire_ret_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -2875,6 +2885,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_ret_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -3110,6 +3121,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_ret_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -3381,6 +3393,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -3650,6 +3663,7 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -3919,6 +3933,7 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -4188,6 +4203,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -4457,6 +4473,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -4726,6 +4743,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -4995,6 +5013,7 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -5264,6 +5283,7 @@ define amdgpu_kernel void @private_singlethread_release_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -5533,6 +5553,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -5802,6 +5823,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -6071,6 +6093,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -6340,6 +6363,7 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -6609,6 +6633,7 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -6878,6 +6903,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7147,6 +7173,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7444,6 +7471,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -7745,6 +7773,7 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -8046,6 +8075,7 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -8347,6 +8377,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -8648,6 +8679,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -8949,6 +8981,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -9250,6 +9283,7 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -9551,6 +9585,7 @@ define amdgpu_kernel void @private_singlethread_release_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -9852,6 +9887,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -10153,6 +10189,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -10454,6 +10491,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -10755,6 +10793,7 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -11056,6 +11095,7 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -11357,6 +11397,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -11658,6 +11699,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -12641,6 +12683,7 @@ define amdgpu_kernel void @private_singlethread_one_as_unordered_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -12819,6 +12862,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -12997,6 +13041,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -13175,6 +13220,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -13353,6 +13399,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -13531,6 +13578,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -13709,6 +13757,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -13887,6 +13936,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -14065,6 +14115,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -14297,6 +14348,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_ret_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -14532,6 +14584,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_ret_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -14767,6 +14820,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_ret_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -15038,6 +15092,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_cmpxc
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -15307,6 +15362,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_cmpxchg
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -15576,6 +15632,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_cmpxchg
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -15845,6 +15902,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_cmpxchg
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -16114,6 +16172,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_cmpxchg
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -16383,6 +16442,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_cmpxchg
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -16652,6 +16712,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -16921,6 +16982,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -17190,6 +17252,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -17459,6 +17522,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -17728,6 +17792,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_cmpxchg
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -17997,6 +18062,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18266,6 +18332,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18535,6 +18602,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18804,6 +18872,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19101,6 +19170,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_ret_c
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -19402,6 +19472,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_ret_cmp
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -19703,6 +19774,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_ret_cmp
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -20004,6 +20076,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_ret_cmp
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -20305,6 +20378,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_ret_cmp
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -20606,6 +20680,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_ret_cmp
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -20907,6 +20982,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_ret_cmpxc
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -21208,6 +21284,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_ret_cmpxc
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -21509,6 +21586,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_ret_cmpxc
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -21810,6 +21888,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_ret_cmpxc
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -22111,6 +22190,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_ret_cmp
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -22412,6 +22492,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_ret_cmpxc
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -22713,6 +22794,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_ret_cmpxc
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -23014,6 +23096,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxc
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -23315,6 +23398,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxc
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll
index 24ec3a34c4e6e..1e2153f76bc03 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll
@@ -804,13 +804,9 @@ define amdgpu_kernel void @private_system_seq_cst_load(
; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -992,6 +988,7 @@ define amdgpu_kernel void @private_system_unordered_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -1170,6 +1167,7 @@ define amdgpu_kernel void @private_system_monotonic_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -1349,9 +1347,8 @@ define amdgpu_kernel void @private_system_release_store(
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
@@ -1532,9 +1529,8 @@ define amdgpu_kernel void @private_system_seq_cst_store(
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
@@ -1714,6 +1710,7 @@ define amdgpu_kernel void @private_system_monotonic_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -1892,6 +1889,7 @@ define amdgpu_kernel void @private_system_acquire_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
@@ -2073,9 +2071,8 @@ define amdgpu_kernel void @private_system_release_atomicrmw(
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
@@ -2256,9 +2253,8 @@ define amdgpu_kernel void @private_system_acq_rel_atomicrmw(
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2441,9 +2437,8 @@ define amdgpu_kernel void @private_system_seq_cst_atomicrmw(
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -2679,6 +2674,7 @@ define amdgpu_kernel void @private_system_acquire_ret_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
@@ -2916,13 +2912,10 @@ define amdgpu_kernel void @private_system_acq_rel_ret_atomicrmw(
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s1
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -3159,13 +3152,10 @@ define amdgpu_kernel void @private_system_seq_cst_ret_atomicrmw(
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s1
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -3437,6 +3427,7 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -3706,6 +3697,7 @@ define amdgpu_kernel void @private_system_acquire_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
@@ -3978,9 +3970,8 @@ define amdgpu_kernel void @private_system_release_monotonic_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
@@ -4252,9 +4243,8 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -4528,9 +4518,8 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -4803,6 +4792,7 @@ define amdgpu_kernel void @private_system_monotonic_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
@@ -5074,6 +5064,7 @@ define amdgpu_kernel void @private_system_acquire_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
@@ -5346,9 +5337,8 @@ define amdgpu_kernel void @private_system_release_acquire_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -5622,9 +5612,8 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -5898,9 +5887,8 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -6174,9 +6162,8 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
@@ -6477,6 +6464,7 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -6778,6 +6766,7 @@ define amdgpu_kernel void @private_system_acquire_monotonic_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
@@ -7081,13 +7070,10 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_ret_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -7390,13 +7376,10 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_ret_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -7698,9 +7681,8 @@ define amdgpu_kernel void @private_system_monotonic_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -8002,6 +7984,7 @@ define amdgpu_kernel void @private_system_acquire_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
@@ -8305,13 +8288,10 @@ define amdgpu_kernel void @private_system_release_acquire_ret_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -8614,13 +8594,10 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_ret_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -8923,13 +8900,10 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_ret_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -9232,13 +9206,10 @@ define amdgpu_kernel void @private_system_monotonic_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -9541,9 +9512,8 @@ define amdgpu_kernel void @private_system_acquire_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -9848,13 +9818,10 @@ define amdgpu_kernel void @private_system_relese_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -10157,13 +10124,10 @@ define amdgpu_kernel void @private_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -10466,13 +10430,10 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -11277,13 +11238,9 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_load(
; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_load_b32 v0, v[0:1] scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -11466,6 +11423,7 @@ define amdgpu_kernel void @private_system_one_as_unordered_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -11644,6 +11602,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -11823,10 +11782,9 @@ define amdgpu_kernel void @private_system_one_as_release_store(
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -12006,10 +11964,9 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_store(
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -12188,6 +12145,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -12366,6 +12324,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
@@ -12547,10 +12506,9 @@ define amdgpu_kernel void @private_system_one_as_release_atomicrmw(
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -12730,10 +12688,9 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_atomicrmw(
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
@@ -12915,10 +12872,9 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_atomicrmw(
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
@@ -13153,6 +13109,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_ret_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
@@ -13391,13 +13348,10 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_ret_atomicrmw(
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s1
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -13635,13 +13589,10 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_ret_atomicrmw(
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s1
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -13914,6 +13865,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -14183,6 +14135,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
@@ -14455,10 +14408,9 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -14729,10 +14681,9 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
@@ -15005,10 +14956,9 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
@@ -15280,6 +15230,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
@@ -15551,6 +15502,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
@@ -15823,10 +15775,9 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
@@ -16099,10 +16050,9 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
@@ -16375,10 +16325,9 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
@@ -16651,10 +16600,9 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
@@ -16927,10 +16875,9 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
@@ -17203,10 +17150,9 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
@@ -17479,10 +17425,9 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
@@ -17755,10 +17700,9 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
@@ -18058,6 +18002,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_ret_cmpxchg
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -18359,6 +18304,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
@@ -18663,10 +18609,9 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_ret_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -18969,13 +18914,10 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -19279,13 +19221,10 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -19588,9 +19527,8 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -19893,6 +19831,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
@@ -20197,13 +20136,10 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_ret_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -20507,13 +20443,10 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -20817,13 +20750,10 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -21127,13 +21057,10 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -21437,10 +21364,9 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
@@ -21745,13 +21671,10 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -22055,13 +21978,10 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_dscnt 0x0
@@ -22365,13 +22285,10 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1250-NEXT: s_wait_bvhcnt 0x0
-; GFX1250-NEXT: s_wait_samplecnt 0x0
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_dscnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll
index 8b2254412c0c8..28d9d5dacd9e3 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll
@@ -984,6 +984,7 @@ define amdgpu_kernel void @private_wavefront_unordered_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -1162,6 +1163,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -1340,6 +1342,7 @@ define amdgpu_kernel void @private_wavefront_release_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -1518,6 +1521,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -1696,6 +1700,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -1874,6 +1879,7 @@ define amdgpu_kernel void @private_wavefront_acquire_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -2052,6 +2058,7 @@ define amdgpu_kernel void @private_wavefront_release_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -2230,6 +2237,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -2408,6 +2416,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -2640,6 +2649,7 @@ define amdgpu_kernel void @private_wavefront_acquire_ret_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -2875,6 +2885,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_ret_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -3110,6 +3121,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_ret_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -3381,6 +3393,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -3650,6 +3663,7 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -3919,6 +3933,7 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -4188,6 +4203,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -4457,6 +4473,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -4726,6 +4743,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -4995,6 +5013,7 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -5264,6 +5283,7 @@ define amdgpu_kernel void @private_wavefront_release_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -5533,6 +5553,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -5802,6 +5823,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -6071,6 +6093,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -6340,6 +6363,7 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -6609,6 +6633,7 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -6878,6 +6903,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7147,6 +7173,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -7444,6 +7471,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -7745,6 +7773,7 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -8046,6 +8075,7 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -8347,6 +8377,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -8648,6 +8679,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -8949,6 +8981,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -9250,6 +9283,7 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -9551,6 +9585,7 @@ define amdgpu_kernel void @private_wavefront_release_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -9852,6 +9887,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -10153,6 +10189,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -10454,6 +10491,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -10755,6 +10793,7 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -11056,6 +11095,7 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -11357,6 +11397,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -11658,6 +11699,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -12641,6 +12683,7 @@ define amdgpu_kernel void @private_wavefront_one_as_unordered_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -12819,6 +12862,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -12997,6 +13041,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -13175,6 +13220,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -13353,6 +13399,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -13531,6 +13578,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -13709,6 +13757,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -13887,6 +13936,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -14065,6 +14115,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -14297,6 +14348,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_ret_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -14532,6 +14584,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_ret_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -14767,6 +14820,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_ret_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -15038,6 +15092,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -15307,6 +15362,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -15576,6 +15632,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -15845,6 +15902,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -16114,6 +16172,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -16383,6 +16442,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -16652,6 +16712,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -16921,6 +16982,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -17190,6 +17252,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -17459,6 +17522,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -17728,6 +17792,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -17997,6 +18062,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18266,6 +18332,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18535,6 +18602,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -18804,6 +18872,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -19101,6 +19170,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_ret_cmpx
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -19402,6 +19472,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_ret_cmpxch
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -19703,6 +19774,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_ret_cmpxch
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -20004,6 +20076,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_ret_cmpxch
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -20305,6 +20378,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_ret_cmpxch
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -20606,6 +20680,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_ret_cmpxch
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -20907,6 +20982,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -21208,6 +21284,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -21509,6 +21586,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -21810,6 +21888,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -22111,6 +22190,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_ret_cmpxch
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -22412,6 +22492,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -22713,6 +22794,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -23014,6 +23096,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -23315,6 +23398,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll
index 127434c365f95..01b2f6835cf7b 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll
@@ -803,7 +803,8 @@ define amdgpu_kernel void @private_workgroup_seq_cst_load(
; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -985,6 +986,7 @@ define amdgpu_kernel void @private_workgroup_unordered_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -1163,6 +1165,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -1341,7 +1344,9 @@ define amdgpu_kernel void @private_workgroup_release_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -1520,7 +1525,9 @@ define amdgpu_kernel void @private_workgroup_seq_cst_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -1699,6 +1706,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -1877,8 +1885,9 @@ define amdgpu_kernel void @private_workgroup_acquire_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -2056,7 +2065,9 @@ define amdgpu_kernel void @private_workgroup_release_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -2235,9 +2246,11 @@ define amdgpu_kernel void @private_workgroup_acq_rel_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -2415,9 +2428,11 @@ define amdgpu_kernel void @private_workgroup_seq_cst_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -2649,6 +2664,7 @@ define amdgpu_kernel void @private_workgroup_acquire_ret_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -2884,7 +2900,9 @@ define amdgpu_kernel void @private_workgroup_acq_rel_ret_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -3120,7 +3138,9 @@ define amdgpu_kernel void @private_workgroup_seq_cst_ret_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s1
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -3392,6 +3412,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -3661,8 +3682,9 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -3931,7 +3953,9 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -4201,9 +4225,11 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -4472,9 +4498,11 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -4743,8 +4771,9 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -5013,8 +5042,9 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -5283,9 +5313,11 @@ define amdgpu_kernel void @private_workgroup_release_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -5554,9 +5586,11 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -5825,9 +5859,11 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -6096,9 +6132,11 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -6367,9 +6405,11 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -6638,9 +6678,11 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -6909,9 +6951,11 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -7180,9 +7224,11 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -7479,6 +7525,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -7780,6 +7827,7 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -8081,7 +8129,9 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -8383,7 +8433,9 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -8685,7 +8737,9 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -8987,6 +9041,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -9288,6 +9343,7 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -9589,7 +9645,9 @@ define amdgpu_kernel void @private_workgroup_release_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -9891,7 +9949,9 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -10193,7 +10253,9 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -10495,7 +10557,9 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -10797,7 +10861,9 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -11099,7 +11165,9 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -11401,7 +11469,9 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -11703,7 +11773,9 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
-; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -12506,6 +12578,8 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_load(
; GFX1250-NEXT: v_cndmask_b32_e64 v0, s1, v0, s2
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -12687,6 +12761,7 @@ define amdgpu_kernel void @private_workgroup_one_as_unordered_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -12865,6 +12940,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -13043,6 +13119,9 @@ define amdgpu_kernel void @private_workgroup_one_as_release_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -13221,6 +13300,9 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_store(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
i32 %in, ptr addrspace(5) %out) {
@@ -13399,6 +13481,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -13577,7 +13660,9 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -13755,6 +13840,9 @@ define amdgpu_kernel void @private_workgroup_one_as_release_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
@@ -13933,7 +14021,11 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -14111,7 +14203,11 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in) {
entry:
@@ -14343,6 +14439,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_ret_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -14578,6 +14675,9 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -14813,6 +14913,9 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX1250-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v1, v2
; GFX1250-NEXT: v_mov_b32_e32 v2, s1
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_swap_b32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -15084,6 +15187,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -15353,7 +15457,9 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -15622,6 +15728,9 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
@@ -15891,7 +16000,11 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -16160,7 +16273,11 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -16429,7 +16546,9 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -16698,7 +16817,9 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -16967,7 +17088,11 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -17236,7 +17361,11 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -17505,7 +17634,11 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -17774,7 +17907,11 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -18043,7 +18180,11 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -18312,7 +18453,11 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -18581,7 +18726,11 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -18850,7 +18999,11 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s0
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3]
+; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: s_endpgm
ptr addrspace(5) %out, i32 %in, i32 %old) {
entry:
@@ -19147,6 +19300,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_ret_cmpx
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -19448,6 +19602,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_ret_cmpxch
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -19749,6 +19904,9 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_ret_cmpxch
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -20050,6 +20208,9 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_ret_cmpxch
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -20351,6 +20512,9 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_ret_cmpxch
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -20652,6 +20816,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_ret_cmpxch
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -20953,6 +21118,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -21254,6 +21420,9 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -21555,6 +21724,9 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -21856,6 +22028,9 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -22157,6 +22332,9 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_ret_cmpxch
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -22458,6 +22636,9 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -22759,6 +22940,9 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -23060,6 +23244,9 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
@@ -23361,6 +23548,9 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX1250-NEXT: v_mov_b32_e32 v4, s1
; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX1250-NEXT: v_mov_b32_e32 v3, v4
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SE
More information about the llvm-commits
mailing list